12345678910111213141516171819202122232425262728293031323334 |
import re

import pandas as pd
from bs4 import BeautifulSoup
# Source spreadsheet (must provide "title" and "content" columns) and the
# plain-text file the extracted chunks are written to.
filename = r"G:\NLPDatasets\bidi_industry\select___from_T_zhao_tou_biao_xin_xi.xlsx"
save_name = r"G:\NLPDatasets\bidi_industry\bidi_industry_knowledge.txt"

# Any line containing one of these boilerplate markers is discarded.
# Compiled once here instead of being looked up for every line.
BOILERPLATE = re.compile(r"扫码关注我们|了解更多精彩|将追究法律责任|来源:|请登录|保标招标网|gov-bid")

# A chunk is emitted as soon as the accumulated text grows beyond this length.
CHUNK_SIZE = 500


def cell_text(value):
    """Return a spreadsheet cell as text, mapping missing cells to ``""``.

    ``pd.read_excel`` yields NaN for empty cells; passing that straight to
    ``str`` would inject a literal "nan" into the output and passing it to
    BeautifulSoup raises ``TypeError``.
    """
    if isinstance(value, str):
        return value
    return "" if pd.isna(value) else str(value)


def chunk_lines(lines, chunk_size=CHUNK_SIZE):
    """Yield concatenated chunks built from the usable lines of one document.

    Each line is stripped; empty lines and lines matching ``BOILERPLATE`` are
    skipped.  The remaining lines are joined without a separator, and a chunk
    is yielded as soon as its length exceeds ``chunk_size``.  Whatever is left
    after the last line is yielded as a final, shorter chunk so that document
    tails and short documents are not lost.

    Args:
        lines: iterable of raw text lines.
        chunk_size: length threshold that triggers emitting a chunk.

    Yields:
        str: one chunk of concatenated text (never empty).
    """
    parts = []
    length = 0
    for raw in lines:
        line = raw.strip()
        if not line or BOILERPLATE.search(line):
            continue
        parts.append(line)
        length += len(line)
        if length > chunk_size:
            yield "".join(parts)
            parts = []
            length = 0
    if parts:
        # Flush the remainder that never reached the threshold.
        yield "".join(parts)


def document_chunks(title, content):
    """Return the text chunks for one row: its title followed by its body.

    ``content`` is parsed as HTML with the "html5lib" parser and reduced to
    its text; the title is placed in front of the body lines and goes through
    the same filtering as every other line.
    """
    html = cell_text(content)
    body_text = BeautifulSoup(html, "html5lib").get_text() if html else ""
    lines = [cell_text(title)]
    lines.extend(body_text.split("\n"))
    return chunk_lines(lines)


def main():
    """Read ``filename`` and write one chunk per line to ``save_name``."""
    df = pd.read_excel(filename)
    with open(save_name, "w", encoding="utf8") as f:
        for title, content in zip(df["title"], df["content"]):
            f.writelines(chunk + "\n" for chunk in document_chunks(title, content))


if __name__ == "__main__":
    main()
|