re_bidway.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. import ast
  2. import pandas as pd
  3. import re
  4. from BiddingKG.dl.interface import Entitys
  5. def re_bidway(text):
  6. # df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
  7. reg = re.compile(u'(采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式'
  8. u'|发包方式|发包类型|开展方式|招标类型)(.*)'
  9. u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
  10. u'|电子书面竞投|邀请招标|定向公开|询价采购|抽签摇号'
  11. u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
  12. u'|网上招标|其他'
  13. u'|竞谈竞价|网上直购|公开竞谈'
  14. u'|库内邀请|库内公开发包)')
  15. # reg = re.compile(u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源|网络竞价'
  16. # u'|竞争性谈判|公开询价|邀请招标|公开招募|公开询比价|电子书面竞投'
  17. # u'|网上电子投标|比质比价|定向询单|国内比选|电子竞价'
  18. # u'|公开招租|公开竞标方式|网上招标|公开招标|国内竞争性谈判'
  19. # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
  20. # u'|库内邀请|询价采购|询比采购|分散采购|单一来源采购)')
  21. reg2 = re.compile(u'(采用|以|)'
  22. u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
  23. u'|竞争性谈判|询价|电子书面竞投|电子竞价'
  24. u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
  25. u'|网上招标|分散采购'
  26. u'|竞谈竞价|网上直购|公开竞谈'
  27. u'|库内邀请)'
  28. u'(采购方式|方式)')
  29. reg1 = re.compile(
  30. # u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源采购|网络竞价|公开招商方式'
  31. # u'|竞争性谈判|公开询价|询价采购|邀请招标|公开招募|公开询比|电子书面竞投'
  32. # u'|网上电子投标|比质比价|定向询单|询比采购|国内比选|单一来源|公开选取|库内公开发包'
  33. # u'|公开招租|公开竞标方式|网上招标|公开招标|竞争性谈判|公开招投标'
  34. # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
  35. # u'|国际公开竞争性招标)'
  36. u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
  37. u'|竞争性谈判|询价|电子书面竞投'
  38. u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
  39. u'|网上招标|分散采购'
  40. u'|竞谈竞价|网上直购|公开竞谈'
  41. u'|库内邀请)'
  42. )
  43. # 都切为4个字符
  44. # reg1_not = re.compile(u'(及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录)')
  45. reg1_not = re.compile(u'(及单一来|价小组成|除单一来|性谈判邀|询价记录)')
  46. reg3 = re.compile(u'(采购方式:邀请|采购方式:公开|采购方式:询价|分散采购|公开招标|竞价|磋商|询比|竞标|邀请招标|公开招募|公开招租)')
  47. reg_standard = re.compile(u'(公开招标|竞争性磋商|竞争性谈判|单一来源'
  48. u'|竞争性谈判|询价|邀请招标|公开招募|询比|电子书面竞投'
  49. u'|网上电子投标|比质比价|询单|比选'
  50. u'|公开招租|网上招标|分散采购'
  51. u'|网上直购|公开竞谈|采购方式:邀请|采购方式:公开|采购方式:询价)'
  52. )
  53. # text_list = df["text"].to_list()
  54. text_list = []
  55. text_list.append(text)
  56. text_index_list = []
  57. output_list = []
  58. for index in range(len(text_list)):
  59. # 全文下标
  60. text_index = [0, 0]
  61. input_str = text_list[index]
  62. # 把一些混淆的词先替换掉
  63. input_str = re.sub(reg1_not, "####", input_str)
  64. match = reg.search(input_str)
  65. output_str = None
  66. # 根据正则表达式匹配
  67. if match:
  68. # 更新全文下标
  69. text_index[0] = match.start()
  70. text_index[1] = match.end()
  71. # 判断长度,截断
  72. if len(match.group()) >= 15:
  73. ss = re.split(",|\.|,|。|;|;", match.group())
  74. # 判断所需的字符串在哪一段
  75. for i in range(len(ss)):
  76. if re.search(reg1, ss[i]):
  77. output_str = ss[i]
  78. # 更新全文下标
  79. front_len, back_len = calculateLen(ss, i)
  80. text_index[0] = text_index[0] + front_len + i
  81. text_index[1] = text_index[1] - back_len + len(ss) -1 - i
  82. break
  83. else:
  84. output_str = match.group()
  85. else:
  86. match2 = re.search(reg2, input_str)
  87. if match2:
  88. # 更新全文下标
  89. text_index[0] = match2.start()
  90. text_index[1] = match2.end()
  91. output_str = match2.group()
  92. else:
  93. match1 = re.search(reg1, input_str)
  94. if match1:
  95. # 更新全文下标
  96. text_index[0] = match1.start()
  97. text_index[1] = match1.end()
  98. output_str = match1.group()
  99. # 再判断一次长度
  100. if output_str is not None:
  101. if len(output_str) >= 15:
  102. match2 = re.search(reg2, input_str)
  103. if match2:
  104. # 更新全文下标
  105. text_index[0] = match2.start()
  106. text_index[1] = match2.end()
  107. output_str = match2.group()
  108. if len(output_str) >= 15:
  109. match1 = re.search(reg1, input_str)
  110. if match1:
  111. # 更新全文下标
  112. text_index[0] = match1.start()
  113. text_index[1] = match1.end()
  114. output_str = match1.group()
  115. # 最后输出还为空,匹配一些易混淆的词
  116. if output_str is None:
  117. match3 = re.search(reg3, input_str)
  118. if match3:
  119. # 更新全文下标
  120. text_index[0] = match3.start()
  121. text_index[1] = match3.end()
  122. output_str = match3.group()
  123. # 处理前缀等无用词
  124. if output_str is not None:
  125. match5 = re.search("分散采购|采购方式:邀请", output_str)
  126. if not match5:
  127. # 公开采购转为公开招标
  128. output_str = re.sub("公开采购", "公开招标", output_str)
  129. # 去掉第一个字符冒号
  130. ss = re.split(":|:", output_str)
  131. output_str = ss[-1]
  132. # 更新全文下标
  133. front_len, back_len = calculateLen(ss, len(ss) - 1)
  134. text_index[0] = text_index[0] + front_len + len(ss) - 1
  135. # 去掉采购、方式、采用
  136. match6 = re.search("(采用|出售|直接(|现就本次|招标为)", output_str)
  137. match7 = re.search("(采购|方式|进行)", output_str)
  138. output_str = re.sub("(采购|方式|采用|出售|进行|直接(|现就本次|招标为)", "", output_str)
  139. # 更新全文下标
  140. if match6:
  141. text_index[0] += match6.end() - match6.start()
  142. if match7:
  143. text_index[1] -= match7.end() - match7.start()
  144. # 使用标准标签过滤
  145. match4 = re.search(reg_standard, output_str)
  146. if match4:
  147. output_str = match4.group()
  148. # 更新全文下标
  149. text_index[0] += match4.start()
  150. text_index[1] = text_index[0] + match4.end() - match4.start()
  151. output_list.append(output_str)
  152. # text_index_list.append(str(text_index))
  153. text_index_list.append(text_index)
  154. # df["re"] = pd.DataFrame(output_list)
  155. # df["text_index"] = pd.DataFrame(text_index_list)
  156. # index_to_word = []
  157. # for index, row in df.iterrows():
  158. # i_list = ast.literal_eval(row["text_index"])
  159. # word = row["text"][i_list[0]:i_list[1]]
  160. # if len(word) >= 20:
  161. # word = ""
  162. # index_to_word.append(word)
  163. # df["index2word"] = pd.DataFrame(index_to_word)
  164. # df.to_csv("C:\\Users\\admin\\Desktop\\bidway_text2.csv")
  165. return output_list[0], text_index_list[0]
  166. def calculateLen(ss, i):
  167. front_len = 0
  168. back_len = 0
  169. for index in range(i):
  170. front_len += len(ss[index])
  171. for index in range(i+1, len(ss)):
  172. back_len += len(ss[index])
  173. return front_len, back_len
  174. def extract_bidway(text):
  175. list_bidway = []
  176. word, text_index_list = re_bidway(text)
  177. if word is not None:
  178. d = {"body": word, "begin_index": text_index_list[0], "end_index": text_index_list[1]}
  179. list_bidway.append(d)
  180. # print(d)
  181. return list_bidway
  182. if __name__ == "__main__":
  183. # df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
  184. # s = df["text"].iloc[1]
  185. s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
  186. extract_bidway(s)