data_process.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. from BiddingKG.dl.interface.Preprocessing import get_preprocessed_article,get_preprocessed_sentences
  2. import pandas as pd
  3. import re
  4. from BiddingKG.dl.common.nerUtils import getTokens
  5. def preprocess(text):
  6. text = re.sub("\n+",',',text)
  7. text = re.sub("\s+|?+",'',text)
  8. text = re.sub("[\.·_]{2,}", ',', text)
  9. text = re.sub("_", '', text)
  10. text = text[:1800]
  11. sentences = text.split("。")
  12. sentences = [s for s in sentences if s]
  13. if not sentences:
  14. return []
  15. tokens = getTokens(sentences)
  16. new_tokens = []
  17. for t in tokens:
  18. new_tokens.extend(t)
  19. return new_tokens
  20. def data_process1():
  21. data = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data01.csv",index_col=0)
  22. text = []
  23. idx = 1
  24. for html_text in data['attachmenthtml']:
  25. res = get_preprocessed_article([[0,html_text,"","",""]])
  26. text.append(res[0].content)
  27. print(idx)
  28. idx += 1
  29. data['text'] = text
  30. data.to_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_process.csv")
  31. def data_process2():
  32. data = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_process.csv",index_col=0)
  33. tokens = []
  34. idx = 1
  35. for html_text in data['text']:
  36. _tokens = []
  37. list_articles = get_preprocessed_article([[0,html_text,"","",""]])
  38. list_sentences = get_preprocessed_sentences(list_articles, True)
  39. for sent in list_sentences[0]:
  40. _tokens.extend(sent.tokens)
  41. tokens.append(_tokens)
  42. # print(_tokens)
  43. print(idx)
  44. idx += 1
  45. data['tokens'] = tokens
  46. data.to_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_process.csv")
  47. classes_dict = {
  48. '其他':0,
  49. '招标文件':1,
  50. '限价(控制价)':2,
  51. '工程量清单':3,
  52. '采购清单':4,
  53. '评标办法':5
  54. }
  55. def data_process3():
  56. data = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data01.csv",index_col=0)
  57. re_str = []
  58. re_label = []
  59. new_text = []
  60. idx = 0
  61. for text in data['attachmentcon']:
  62. print(idx)
  63. idx+=1
  64. text = str(text)
  65. # text = re.sub("\n+", ',', text)
  66. # text = re.sub("\s+", '', text)
  67. # text = re.sub("?+", '', text)
  68. text_tokens = preprocess(text)
  69. text_tokens = text_tokens[:512]
  70. text = "".join(text_tokens)
  71. text = re.sub("[\.·…]{2,}", ',', text)
  72. # text = text[:800]
  73. new_text.append(text)
  74. if re.search("中标人?公[示告]",text) or re.search('候选人公[示告]',text) or re.search('成交公[示告]',text) or re.search('中标结果公示',text):
  75. re_str.append("中标候选人公示")
  76. re_label.append(classes_dict['其他'])
  77. elif re.search("招标文件",text):
  78. re_str.append("招标文件")
  79. re_label.append(classes_dict['招标文件'])
  80. elif re.search("限价",text) or re.search('控制价',text):
  81. re_str.append("限价(控制价)")
  82. re_label.append(classes_dict['限价(控制价)'])
  83. elif re.search('工程量清单',text):
  84. re_str.append("工程量清单")
  85. re_label.append(classes_dict['工程量清单'])
  86. elif re.search("采购.{0,2}清单",text):
  87. re_str.append("采购清单")
  88. re_label.append(classes_dict['采购清单'])
  89. elif re.search('评标办法',text):
  90. re_str.append("评标办法")
  91. re_label.append(classes_dict['评标办法'])
  92. else:
  93. re_str.append("")
  94. re_label.append('')
  95. data['re_str'] = re_str
  96. data['re_label'] = re_label
  97. data['new_label'] = re_label
  98. data['attachmentcon'] = new_text
  99. # data =data.drop(columns=['attachmenthtml'])
  100. data['attachmenthtml'] = [re.sub('\n{2,}','',i.replace("<div> </div>",'',i))[:4000] for i in data['attachmenthtml']]
  101. label_data = data[data['re_str']!='']
  102. label_data.to_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_relabel01.csv")
  103. no_label_data = data[data['re_str']=='']
  104. no_label_data.to_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_nolabel01.csv")
if __name__ == '__main__':
    # The steps are run by hand, one at a time: uncomment the step to run.
    # data_process1()
    # data_process2()
    # data_process3()
    # NOTE(review): data_process4 is not defined in the visible part of this file.
    # data_process4()
    pass