# re_channel_103.py
  1. import pandas as pd
  2. import re
  3. # 各投标人
  4. # 各潜在投标人
  5. # 各潜在投标人:
  6. # 致各招标文件持有者:
  7. # 致各投标人
  8. # 各潜在投标供应商:
  9. # 修改、澄清(答疑)纪要内容如下: 1、
  10. # 答疑澄清与修改的主要内容:
  11. # 对文件澄清与修改的主要内容
  12. # 澄清、修改内容要点
  13. # 答疑纪要
  14. # 答疑如下
  15. # 招标文件答疑和招标文件修改通知
  16. # 招标文件答疑通知
  17. # 答疑及补遗通知
  18. # 答疑回复如下:
  19. # 现对投标人提出的质疑回复如下:
  20. # 对文件澄清与修改的主要内容 详见招标文件
  21. # 修改的主要内容 详见附件
  22. # 澄清或修改事项:
  23. # 第1次答疑
  24. # 第1次答疑澄清
  25. # 答疑补遗文件
  26. # 补遗书澄清文件 答疑澄清
  27. # 质疑1
  28. # 问题
  29. # 答疑文件1
  30. # 具体补遗内容详见附件
  31. # 请问 答
  32. # 问题 回复
  33. # 答疑澄清公告 1:
  34. # 现对招标文件作如下澄清:
  35. # 详见答疑澄清文件
  36. # 详见答疑文件。
  37. channel_103 = '(澄清|答疑|补遗|修改)'
  38. channel_103_0 = '(致|至|)(各|各个)(潜在|)(投标|招标|招标文件持有|报价|竞选|)(人|者|供应商|单位)(:|:)'
  39. channel_103_1 = '(澄清|答疑|补遗|修改|质疑)(.?)(具体内容|主要内容|内容|回复|发布|纪要|事项|如下){1,2}(.?)' \
  40. '(如下|[::]|详见|点击下载附件|[1一][::、]|(1)|\\(1\\)|一)'
  41. channel_103_2 = '第(.?)次(答疑|澄清)'
  42. channel_103_3 = '(澄清|答疑|补遗|修改)(公告|文件)'
  43. channel_103_after = '(请问|提问|问题|答复|回复|质疑|答|问){1,2}[12一]?[::]|[一1][::、]|(1)|\\(1\\)|(详见|见)(附件|答疑文件|澄清文件|答疑澄清文件)'
  44. channel_103_4 = '(补充答疑|提疑内容|请问|提问|问题|回复|答复|答疑|质疑|答|问)[12一]?[::]'
  45. channel_103_5 = '(见|详见)(答疑澄清文件|澄清文件|答疑文件)|补遗内容详见附件'
  46. # 答疑澄清时间
  47. # 对文件澄清与修改的主要内容 无澄清文件
  48. # 对文件澄清与修改的主要内容 无
  49. # 请各投标单位自行下载
  50. not_channel_103 = '答疑澄清时间|主要内容.?无|请各投标单位'
  51. def re_standard_channel_103(_str):
  52. channel_103_list = []
  53. if not re.search(channel_103, _str):
  54. print("not")
  55. return channel_103_list
  56. reg_standard = "(?P<value>" + channel_103_0 + ")"
  57. match = re.finditer(reg_standard, _str)
  58. for m in match:
  59. m_dict = m.groupdict()
  60. m_span = m.span()
  61. keyword_index = [m_span[0], m_span[1]]
  62. keyword = m_dict.get('value')
  63. channel_103_list.append([keyword, keyword_index])
  64. if channel_103_list:
  65. print("0", channel_103_list)
  66. return channel_103_list
  67. reg_standard = "(?P<value>" + channel_103_1 + ")"
  68. match = re.finditer(reg_standard, _str)
  69. for m in match:
  70. m_dict = m.groupdict()
  71. m_span = m.span()
  72. keyword_index = [m_span[0], m_span[1]]
  73. keyword = m_dict.get('value')
  74. channel_103_list.append([keyword, keyword_index])
  75. if channel_103_list:
  76. print("1", channel_103_list)
  77. return channel_103_list
  78. reg_standard = "(?P<value>" + channel_103_2 + ")"
  79. match = re.finditer(reg_standard, _str)
  80. for m in match:
  81. m_dict = m.groupdict()
  82. m_span = m.span()
  83. keyword_index = [m_span[0], m_span[1]]
  84. keyword = m_dict.get('value')
  85. if re.search(channel_103_after, _str[keyword_index[1]:keyword_index[1]+50]):
  86. channel_103_list.append([keyword, keyword_index])
  87. if channel_103_list:
  88. print("2", channel_103_list)
  89. return channel_103_list
  90. reg_standard = "(?P<value>" + channel_103_3 + ")"
  91. match = re.finditer(reg_standard, _str)
  92. for m in match:
  93. m_dict = m.groupdict()
  94. m_span = m.span()
  95. keyword_index = [m_span[0], m_span[1]]
  96. keyword = m_dict.get('value')
  97. if re.search(channel_103_after, _str[keyword_index[1]:keyword_index[1]+50]):
  98. channel_103_list.append([keyword, keyword_index])
  99. if channel_103_list:
  100. print("3", channel_103_list)
  101. return channel_103_list
  102. reg_standard = "(?P<value>" + channel_103_4 + ")"
  103. match = re.finditer(reg_standard, _str)
  104. for m in match:
  105. m_dict = m.groupdict()
  106. m_span = m.span()
  107. keyword_index = [m_span[0], m_span[1]]
  108. keyword = m_dict.get('value')
  109. channel_103_list.append([keyword, keyword_index])
  110. if channel_103_list:
  111. print("4", channel_103_list)
  112. return channel_103_list
  113. reg_standard = "(?P<value>" + channel_103_5 + ")"
  114. match = re.finditer(reg_standard, _str)
  115. for m in match:
  116. m_dict = m.groupdict()
  117. m_span = m.span()
  118. keyword_index = [m_span[0], m_span[1]]
  119. keyword = m_dict.get('value')
  120. channel_103_list.append([keyword, keyword_index])
  121. if channel_103_list:
  122. print("5", channel_103_list)
  123. return channel_103_list
  124. return channel_103_list
  125. def re_not_channel_103(_str):
  126. match = re.findall(not_channel_103, _str)
  127. if match:
  128. for word in match:
  129. instead = "#" * len(word)
  130. _str = re.sub(word, instead, _str)
  131. return _str
  132. def re_channel_103(text):
  133. # 替换易混淆词
  134. clean_text = re_not_channel_103(text)
  135. # 查找符合标准形式的
  136. channel_103_list = re_standard_channel_103(clean_text)
  137. return channel_103_list
  138. def extract_channel_103(text):
  139. result_list = []
  140. channel_103_list = re_channel_103(text)
  141. if channel_103_list:
  142. for word, text_index in channel_103_list:
  143. if word is not None:
  144. if text_index[1]-text_index[0] != len(word) \
  145. or text_index[1]-text_index[0] >= 20:
  146. return []
  147. d = {"body": word, "begin_index": text_index[0], "end_index": text_index[1]}
  148. result_list.append(d)
  149. return result_list
  150. def test_csv(_path):
  151. df = pd.read_csv(_path)
  152. predict_list = []
  153. for index, row in df.iterrows():
  154. word_list = re_channel_103(row["doctextcon"], "")
  155. if word_list:
  156. predict = word_list
  157. else:
  158. predict = []
  159. print("predict", predict)
  160. predict_list.append(str(predict))
  161. predict_df = pd.DataFrame(predict_list)
  162. df = pd.concat([df, predict_df], axis=1)
  163. df.to_csv(_path)
  164. print("finish write!")
  165. def test_str():
  166. s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
  167. s = '''
  168. (第1次澄清) 发布时间:2020-11-25 致各招标文件持有者: 招标人──舟山市
  169. '''
  170. print(extract_channel_103(s))
  171. def test_html():
  172. html_path = "C:/Users/Administrator/Desktop/3.html"
  173. with open(html_path, "r") as f:
  174. s = f.read()
  175. print(extract_channel_103(s, title=""))
  176. if __name__ == "__main__":
  177. path = "D:\\BIDI_DOC\\比地_文档\\澄清答疑_result.csv"
  178. # test_csv(path)
  179. test_str()
  180. # test_html(path)
  181. pass