# preprocess.py
import re

import pandas as pd
from bs4 import BeautifulSoup
  4. filename = r"G:\NLPDatasets\bidi_industry\select___from_T_zhao_tou_biao_xin_xi.xlsx"
  5. df = pd.read_excel(filename)
  6. save_name = r"G:\NLPDatasets\bidi_industry\bidi_industry_knowledge.txt"
  7. with open(save_name,"w",encoding="utf8") as f:
  8. for _idx,(title,content) in enumerate(zip(df["title"],df["content"])):
  9. _soup = BeautifulSoup(content,"html5lib")
  10. # print("=========",_idx)
  11. # print(title)
  12. # print(_soup.get_text())
  13. list_line = str(_soup.get_text()).split("\n")
  14. line_text = ""
  15. list_line.insert(0,str(title))
  16. for _line in list_line:
  17. _line = _line.strip()
  18. if len(_line)=="":
  19. continue
  20. if re.search("扫码关注我们|了解更多精彩|将追究法律责任|来源:|请登录|保标招标网|gov-bid",_line) is not None:
  21. continue
  22. line_text += "%s"%(_line)
  23. if len(line_text)>500:
  24. f.write(line_text)
  25. f.write("\n")
  26. line_text = ""