"""Flatten HTML bid announcements from an Excel export into a text corpus.

Reads the ``title`` and ``content`` columns of the workbook, strips the HTML
markup from each ``content`` cell, drops boilerplate lines, and writes the
remaining text to the output file in chunks of just over 500 characters,
one chunk per output line.
"""
import re

import pandas as pd
from bs4 import BeautifulSoup

filename = r"G:\NLPDatasets\bidi_industry\select___from_T_zhao_tou_biao_xin_xi.xlsx"
save_name = r"G:\NLPDatasets\bidi_industry\bidi_industry_knowledge.txt"

# A line containing any of these markers is dropped entirely.
# Compiled once here because it is applied to every line of every row.
_BOILERPLATE = re.compile("扫码关注我们|了解更多精彩|将追究法律责任|来源:|请登录|保标招标网|gov-bid")

# A chunk is flushed to the output as soon as it grows past this many characters.
_CHUNK_CHARS = 500

df = pd.read_excel(filename)

with open(save_name, "w", encoding="utf8") as f:
    for title, content in zip(df["title"], df["content"]):
        # Empty Excel cells are loaded as float NaN, which BeautifulSoup
        # cannot parse; treat a missing cell as an empty document.
        markup = "" if pd.isna(content) else str(content)
        soup = BeautifulSoup(markup, "html5lib")

        # The title is processed as the first line of the document text.
        lines = [str(title), *str(soup.get_text()).split("\n")]

        pieces = []
        size = 0
        for raw_line in lines:
            text = raw_line.strip()
            # Skip blank lines (the original compared len() to "", which
            # never matched) and lines carrying boilerplate markers.
            if not text or _BOILERPLATE.search(text) is not None:
                continue
            pieces.append(text)
            size += len(text)
            if size > _CHUNK_CHARS:
                f.write("".join(pieces))
                f.write("\n")
                pieces.clear()
                size = 0
        # NOTE(review): whatever is left in `pieces` here (a tail of at most
        # _CHUNK_CHARS characters, or a whole short document) is discarded,
        # exactly as in the original script. Confirm whether that is intended
        # or whether the remainder should also be written.