|
@@ -1,222 +1,384 @@
|
|
import ast
|
|
import ast
|
|
-
|
|
|
|
import pandas as pd
|
|
import pandas as pd
|
|
import re
|
|
import re
|
|
|
|
|
|
-from BiddingKG.dl.interface import Entitys
|
|
|
|
-
|
|
|
|
-def re_bidway(text):
|
|
|
|
- # df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
|
|
|
|
-
|
|
|
|
- reg = re.compile(u'(采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式'
|
|
|
|
- u'|发包方式|发包类型|开展方式|招标类型)(.*)'
|
|
|
|
- u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
|
|
|
|
- u'|电子书面竞投|邀请招标|定向公开|询价采购|抽签摇号'
|
|
|
|
- u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
|
|
|
|
- u'|网上招标|其他'
|
|
|
|
- u'|竞谈竞价|网上直购|公开竞谈'
|
|
|
|
- u'|库内邀请|库内公开发包)')
|
|
|
|
-
|
|
|
|
- # reg = re.compile(u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源|网络竞价'
|
|
|
|
- # u'|竞争性谈判|公开询价|邀请招标|公开招募|公开询比价|电子书面竞投'
|
|
|
|
- # u'|网上电子投标|比质比价|定向询单|国内比选|电子竞价'
|
|
|
|
- # u'|公开招租|公开竞标方式|网上招标|公开招标|国内竞争性谈判'
|
|
|
|
- # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
|
|
|
|
- # u'|库内邀请|询价采购|询比采购|分散采购|单一来源采购)')
|
|
|
|
-
|
|
|
|
- reg2 = re.compile(u'(采用|以|)'
|
|
|
|
- u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
|
|
|
|
- u'|竞争性谈判|询价|电子书面竞投|电子竞价'
|
|
|
|
- u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
|
|
|
|
- u'|网上招标|分散采购'
|
|
|
|
- u'|竞谈竞价|网上直购|公开竞谈'
|
|
|
|
- u'|库内邀请)'
|
|
|
|
- u'(采购方式|方式)')
|
|
|
|
-
|
|
|
|
- reg1 = re.compile(
|
|
|
|
- # u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源采购|网络竞价|公开招商方式'
|
|
|
|
- # u'|竞争性谈判|公开询价|询价采购|邀请招标|公开招募|公开询比|电子书面竞投'
|
|
|
|
- # u'|网上电子投标|比质比价|定向询单|询比采购|国内比选|单一来源|公开选取|库内公开发包'
|
|
|
|
- # u'|公开招租|公开竞标方式|网上招标|公开招标|竞争性谈判|公开招投标'
|
|
|
|
- # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
|
|
|
|
- # u'|国际公开竞争性招标)'
|
|
|
|
- u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
|
|
|
|
- u'|竞争性谈判|询价|电子书面竞投'
|
|
|
|
- u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
|
|
|
|
- u'|网上招标|分散采购'
|
|
|
|
- u'|竞谈竞价|网上直购|公开竞谈'
|
|
|
|
- u'|库内邀请)'
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- # 都切为4个字符
|
|
|
|
- # reg1_not = re.compile(u'(及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录)')
|
|
|
|
- reg1_not = re.compile(u'(及单一来|价小组成|除单一来|性谈判邀|询价记录)')
|
|
|
|
-
|
|
|
|
- reg3 = re.compile(u'(采购方式:邀请|采购方式:公开|采购方式:询价|分散采购|公开招标|竞价|磋商|询比|竞标|邀请招标|公开招募|公开招租)')
|
|
|
|
-
|
|
|
|
-
|
|
|
|
- reg_standard = re.compile(u'(公开招标|竞争性磋商|竞争性谈判|单一来源'
|
|
|
|
- u'|竞争性谈判|询价|邀请招标|公开招募|询比|电子书面竞投'
|
|
|
|
- u'|网上电子投标|比质比价|询单|比选'
|
|
|
|
- u'|公开招租|网上招标|分散采购'
|
|
|
|
- u'|网上直购|公开竞谈|采购方式:邀请|采购方式:公开|采购方式:询价)'
|
|
|
|
- )
|
|
|
|
-
|
|
|
|
- # text_list = df["text"].to_list()
|
|
|
|
- text_list = []
|
|
|
|
- text_list.append(text)
|
|
|
|
- text_index_list = []
|
|
|
|
- output_list = []
|
|
|
|
- for index in range(len(text_list)):
|
|
|
|
- # 全文下标
|
|
|
|
- text_index = [0, 0]
|
|
|
|
-
|
|
|
|
- input_str = text_list[index]
|
|
|
|
-
|
|
|
|
- # 把一些混淆的词先替换掉
|
|
|
|
- input_str = re.sub(reg1_not, "####", input_str)
|
|
|
|
-
|
|
|
|
- match = reg.search(input_str)
|
|
|
|
- output_str = None
|
|
|
|
- # 根据正则表达式匹配
|
|
|
|
- if match:
|
|
|
|
- # 更新全文下标
|
|
|
|
- text_index[0] = match.start()
|
|
|
|
- text_index[1] = match.end()
|
|
|
|
-
|
|
|
|
- # 判断长度,截断
|
|
|
|
- if len(match.group()) >= 15:
|
|
|
|
- ss = re.split(",|\.|,|。|;|;", match.group())
|
|
|
|
- # 判断所需的字符串在哪一段
|
|
|
|
- for i in range(len(ss)):
|
|
|
|
- if re.search(reg1, ss[i]):
|
|
|
|
- output_str = ss[i]
|
|
|
|
-
|
|
|
|
- # 更新全文下标
|
|
|
|
- front_len, back_len = calculateLen(ss, i)
|
|
|
|
- text_index[0] = text_index[0] + front_len + i
|
|
|
|
- text_index[1] = text_index[1] - back_len + len(ss) -1 - i
|
|
|
|
-
|
|
|
|
- break
|
|
|
|
|
|
+# from BiddingKG.dl.interface import Entitys
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+# def re_bidway_old(text):
|
|
|
|
+# df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
|
|
|
|
+#
|
|
|
|
+# reg = re.compile(u'(采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式'
|
|
|
|
+# u'|发包方式|发包类型|开展方式|招标类型)(.*)'
|
|
|
|
+# u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
|
|
|
|
+# u'|电子书面竞投|邀请招标|定向公开|询价采购|抽签摇号'
|
|
|
|
+# u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
|
|
|
|
+# u'|网上招标|其他'
|
|
|
|
+# u'|竞谈竞价|网上直购|公开竞谈'
|
|
|
|
+# u'|库内邀请|库内公开发包)')
|
|
|
|
+#
|
|
|
|
+# # reg = re.compile(u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源|网络竞价'
|
|
|
|
+# # u'|竞争性谈判|公开询价|邀请招标|公开招募|公开询比价|电子书面竞投'
|
|
|
|
+# # u'|网上电子投标|比质比价|定向询单|国内比选|电子竞价'
|
|
|
|
+# # u'|公开招租|公开竞标方式|网上招标|公开招标|国内竞争性谈判'
|
|
|
|
+# # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
|
|
|
|
+# # u'|库内邀请|询价采购|询比采购|分散采购|单一来源采购)')
|
|
|
|
+#
|
|
|
|
+# reg2 = re.compile(u'(采用|以|)'
|
|
|
|
+# u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
|
|
|
|
+# u'|竞争性谈判|询价|电子书面竞投|电子竞价'
|
|
|
|
+# u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
|
|
|
|
+# u'|网上招标|分散采购'
|
|
|
|
+# u'|竞谈竞价|网上直购|公开竞谈'
|
|
|
|
+# u'|库内邀请)'
|
|
|
|
+# u'(采购方式|方式)')
|
|
|
|
+#
|
|
|
|
+# reg1 = re.compile(
|
|
|
|
+# # u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源采购|网络竞价|公开招商方式'
|
|
|
|
+# # u'|竞争性谈判|公开询价|询价采购|邀请招标|公开招募|公开询比|电子书面竞投'
|
|
|
|
+# # u'|网上电子投标|比质比价|定向询单|询比采购|国内比选|单一来源|公开选取|库内公开发包'
|
|
|
|
+# # u'|公开招租|公开竞标方式|网上招标|公开招标|竞争性谈判|公开招投标'
|
|
|
|
+# # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
|
|
|
|
+# # u'|国际公开竞争性招标)'
|
|
|
|
+# u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
|
|
|
|
+# u'|竞争性谈判|询价|电子书面竞投'
|
|
|
|
+# u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
|
|
|
|
+# u'|网上招标|分散采购'
|
|
|
|
+# u'|竞谈竞价|网上直购|公开竞谈'
|
|
|
|
+# u'|库内邀请)'
|
|
|
|
+# )
|
|
|
|
+#
|
|
|
|
+# # 都切为4个字符
|
|
|
|
+# # reg1_not = re.compile(u'(及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录)')
|
|
|
|
+# reg1_not = re.compile(u'(及单一来|价小组成|除单一来|性谈判邀|询价记录)')
|
|
|
|
+#
|
|
|
|
+# reg3 = re.compile(u'(采购方式:邀请|采购方式:公开|采购方式:询价|分散采购|公开招标|竞价|磋商|询比|竞标|邀请招标|公开招募|公开招租)')
|
|
|
|
+#
|
|
|
|
+#
|
|
|
|
+# reg_standard = re.compile(u'(公开招标|竞争性磋商|竞争性谈判|单一来源'
|
|
|
|
+# u'|竞争性谈判|询价|邀请招标|公开招募|询比|电子书面竞投'
|
|
|
|
+# u'|网上电子投标|比质比价|询单|比选'
|
|
|
|
+# u'|公开招租|网上招标|分散采购'
|
|
|
|
+# u'|网上直购|公开竞谈|采购方式:邀请|采购方式:公开|采购方式:询价)'
|
|
|
|
+# )
|
|
|
|
+#
|
|
|
|
+# text_list = df["text"].to_list()
|
|
|
|
+# # text_list = []
|
|
|
|
+# # text_list.append(text)
|
|
|
|
+# text_index_list = []
|
|
|
|
+# output_list = []
|
|
|
|
+# for index in range(len(text_list)):
|
|
|
|
+# # 全文下标
|
|
|
|
+# text_index = [0, 0]
|
|
|
|
+#
|
|
|
|
+# input_str = text_list[index]
|
|
|
|
+#
|
|
|
|
+# # 把一些混淆的词先替换掉
|
|
|
|
+# input_str = re.sub(reg1_not, "####", input_str)
|
|
|
|
+#
|
|
|
|
+# match = reg.search(input_str)
|
|
|
|
+# output_str = None
|
|
|
|
+# # 根据正则表达式匹配
|
|
|
|
+# if match:
|
|
|
|
+# # 更新全文下标
|
|
|
|
+# text_index[0] = match.start()
|
|
|
|
+# text_index[1] = match.end()
|
|
|
|
+#
|
|
|
|
+# # 判断长度,截断
|
|
|
|
+# if len(match.group()) >= 15:
|
|
|
|
+# ss = re.split(",|\.|,|。|;|;", match.group())
|
|
|
|
+# # 判断所需的字符串在哪一段
|
|
|
|
+# for i in range(len(ss)):
|
|
|
|
+# if re.search(reg1, ss[i]):
|
|
|
|
+# output_str = ss[i]
|
|
|
|
+#
|
|
|
|
+# # 更新全文下标
|
|
|
|
+# front_len, back_len = calculateLen(ss, i)
|
|
|
|
+# text_index[0] = text_index[0] + front_len + i
|
|
|
|
+# text_index[1] = text_index[1] - back_len + len(ss) -1 - i
|
|
|
|
+#
|
|
|
|
+# break
|
|
|
|
+# else:
|
|
|
|
+# output_str = match.group()
|
|
|
|
+#
|
|
|
|
+# else:
|
|
|
|
+# match2 = re.search(reg2, input_str)
|
|
|
|
+# if match2:
|
|
|
|
+# # 更新全文下标
|
|
|
|
+# text_index[0] = match2.start()
|
|
|
|
+# text_index[1] = match2.end()
|
|
|
|
+#
|
|
|
|
+# output_str = match2.group()
|
|
|
|
+#
|
|
|
|
+# else:
|
|
|
|
+# match1 = re.search(reg1, input_str)
|
|
|
|
+# if match1:
|
|
|
|
+# # 更新全文下标
|
|
|
|
+# text_index[0] = match1.start()
|
|
|
|
+# text_index[1] = match1.end()
|
|
|
|
+# output_str = match1.group()
|
|
|
|
+#
|
|
|
|
+# # 再判断一次长度
|
|
|
|
+# if output_str is not None:
|
|
|
|
+# if len(output_str) >= 15:
|
|
|
|
+# match2 = re.search(reg2, input_str)
|
|
|
|
+# if match2:
|
|
|
|
+# # 更新全文下标
|
|
|
|
+# text_index[0] = match2.start()
|
|
|
|
+# text_index[1] = match2.end()
|
|
|
|
+#
|
|
|
|
+# output_str = match2.group()
|
|
|
|
+# if len(output_str) >= 15:
|
|
|
|
+# match1 = re.search(reg1, input_str)
|
|
|
|
+# if match1:
|
|
|
|
+# # 更新全文下标
|
|
|
|
+# text_index[0] = match1.start()
|
|
|
|
+# text_index[1] = match1.end()
|
|
|
|
+#
|
|
|
|
+# output_str = match1.group()
|
|
|
|
+#
|
|
|
|
+# # 最后输出还为空,匹配一些易混淆的词
|
|
|
|
+# if output_str is None:
|
|
|
|
+# match3 = re.search(reg3, input_str)
|
|
|
|
+# if match3:
|
|
|
|
+# # 更新全文下标
|
|
|
|
+# text_index[0] = match3.start()
|
|
|
|
+# text_index[1] = match3.end()
|
|
|
|
+#
|
|
|
|
+# output_str = match3.group()
|
|
|
|
+#
|
|
|
|
+# # 处理前缀等无用词
|
|
|
|
+# if output_str is not None:
|
|
|
|
+# match5 = re.search("分散采购|采购方式:邀请", output_str)
|
|
|
|
+# if not match5:
|
|
|
|
+# # 公开采购转为公开招标
|
|
|
|
+# output_str = re.sub("公开采购", "公开招标", output_str)
|
|
|
|
+#
|
|
|
|
+# # 去掉第一个字符冒号
|
|
|
|
+# ss = re.split(":|:", output_str)
|
|
|
|
+# output_str = ss[-1]
|
|
|
|
+# # 更新全文下标
|
|
|
|
+# front_len, back_len = calculateLen(ss, len(ss) - 1)
|
|
|
|
+# text_index[0] = text_index[0] + front_len + len(ss) - 1
|
|
|
|
+#
|
|
|
|
+# # 去掉采购、方式、采用
|
|
|
|
+# match6 = re.search("(采用|出售|直接(|现就本次|招标为)", output_str)
|
|
|
|
+# match7 = re.search("(采购|方式|进行)", output_str)
|
|
|
|
+# output_str = re.sub("(采购|方式|采用|出售|进行|直接(|现就本次|招标为)", "", output_str)
|
|
|
|
+# # 更新全文下标
|
|
|
|
+# if match6:
|
|
|
|
+# text_index[0] += match6.end() - match6.start()
|
|
|
|
+# if match7:
|
|
|
|
+# text_index[1] -= match7.end() - match7.start()
|
|
|
|
+#
|
|
|
|
+# # 使用标准标签过滤
|
|
|
|
+# match4 = re.search(reg_standard, output_str)
|
|
|
|
+# if match4:
|
|
|
|
+# output_str = match4.group()
|
|
|
|
+# # 更新全文下标
|
|
|
|
+# text_index[0] += match4.start()
|
|
|
|
+# text_index[1] = text_index[0] + match4.end() - match4.start()
|
|
|
|
+#
|
|
|
|
+# output_list.append(output_str)
|
|
|
|
+# # text_index_list.append(str(text_index))
|
|
|
|
+# text_index_list.append(text_index)
|
|
|
|
+#
|
|
|
|
+# # df["re"] = pd.DataFrame(output_list)
|
|
|
|
+# # df["text_index"] = pd.DataFrame(text_index_list)
|
|
|
|
+#
|
|
|
|
+# # index_to_word = []
|
|
|
|
+# # for index, row in df.iterrows():
|
|
|
|
+# # i_list = ast.literal_eval(row["text_index"])
|
|
|
|
+# # word = row["text"][i_list[0]:i_list[1]]
|
|
|
|
+# # if len(word) >= 20:
|
|
|
|
+# # word = ""
|
|
|
|
+# # index_to_word.append(word)
|
|
|
|
+#
|
|
|
|
+#
|
|
|
|
+# # df["index2word"] = pd.DataFrame(index_to_word)
|
|
|
|
+# # df.to_csv("C:\\Users\\admin\\Desktop\\bidway_text2.csv")
|
|
|
|
+#
|
|
|
|
+# return output_list[0], text_index_list[0]
|
|
|
|
+
|
|
|
|
+
|
|
|
|
# Regex alternations (joined with "|") shared by the matcher functions below.

# Recognised bidding-method keywords. Within a family the longer variant is
# listed before its shorter prefix (e.g. "询比价" before "询比").
bidway = (
    "单一来源"
    "|国内竞争性磋商|竞争性磋商|竞争性谈判|网络竞价|网上竞价|公开竞谈|公开竞价|电子竞价|竞价|竞标|竞谈竞价|电子书面竞投"
    "|公开比选|比质比价|比选"
    "|公开招标|公开招租|公开招募|公开选取|公开招投标"
    "|网上直购|网上招标|网上电子投标|网上挂牌"
    "|邀请招标"
    "|网上询价|公开询价|非定向询价|定向询价|询比价|询单|询价|询比"
    "|库内邀请|库内公开发包|内部邀标"
    "|定点采购议价|定点采购"
    "|竞争性评审"
)

# Phrases that contain a keyword but are not a bidding method.
not_bidway = (
    "及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录|自由竞价"
    "|限时竞价|咨询单位|询价单"
)

# A keyword directly preceded by one of these words is masked out.
not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除"

# A keyword directly followed by one of these words is masked out.
not_bidway_suffix = (
    "文件|报名|邀请|项目|失败|数量|编号|后|时间|类型|名称|和|成交"
    "|标题|开始|结束|产品|报价|供应商|部门|监督|需求|范围|入围|内容|人"
    "|条件|公司|保证金|完毕|事件|成功|活动|地点|标|会|须知|范围"
    "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败"
)

# Field labels that introduce a bidding method ("<label><separator><method>").
bidway_preffix = (
    "采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式"
    "|发包方式|发包类型|开展方式|招标类型|选取方式|招租方式"
)

# Last-resort patterns; the "." in two of them is a regex wildcard.
bidway_special = (
    "采购方式:公开|采购方式:邀请|采购方式:询价"
    "|招标方式:.公开|采购方式:.公开"
    "|分散采购"
)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def re_not_bidway(_str):
    """Mask phrases that only look like a bidding method.

    Three passes run in order, each on the output of the previous one:

    1. any phrase listed in ``not_bidway``;
    2. a ``bidway`` keyword immediately followed by a ``not_bidway_suffix`` word;
    3. a ``not_bidway_preffix`` word immediately followed by a ``bidway`` keyword.

    Every matched span is overwritten with the same number of ``#``
    characters, so the result has the same length as the input and spans
    found in it index the original text as well.

    :param _str: text to clean.
    :return: the masked text.
    """
    def _mask(found):
        # Same-length replacement keeps all character offsets stable.
        return "#" * len(found.group())

    passes = (
        not_bidway,
        "(" + bidway + ")" + "(" + not_bidway_suffix + ")",
        "(" + not_bidway_preffix + ")" + "(" + bidway + ")",
    )
    for pattern in passes:
        # A single callback-driven pass masks exactly the spans the pattern
        # matched. Substituting each matched text separately is order
        # dependent: masking "询价文件" first would also blank the tail of a
        # later "公开询价文件" and leave its "公开" behind.
        _str = re.sub(pattern, _mask, _str)
    return _str
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def re_standard_bidway(_str):
    """Find bidding methods written as ``<label><separator><method>``.

    The label is one of ``bidway_preffix``, the separator is one or two
    arbitrary characters (newline excluded) and the method is one of
    ``bidway``.

    :param _str: text to search.
    :return: list of ``[keyword, [start, end]]`` in order of appearance,
        where the span covers only the method keyword inside ``_str``.
    """
    # The separator is lazy so that it never swallows the first character of
    # the keyword: with a greedy ".{1,2}" the text "招标方式:非定向询价" would be
    # read as separator ":非" + keyword "定向询价".
    reg_standard = "(?P<preffix>" + bidway_preffix + ")" \
                   + "(?P<char>.{1,2}?)" \
                   + "(?P<value>" + bidway + ")"
    return [[m.group("value"), list(m.span("value"))]
            for m in re.finditer(reg_standard, _str)]
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def re_all_bidway(_str):
    """Collect every ``bidway`` keyword occurring in ``_str``.

    :param _str: text to search.
    :return: list of ``[keyword, [start, end]]`` in order of appearance.
    """
    pattern = "(?P<value>" + bidway + ")"
    return [[hit.group(), list(hit.span())]
            for hit in re.finditer(pattern, _str)]
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def re_special_bidway(_str):
    """Collect every ``bidway_special`` pattern occurring in ``_str``.

    :param _str: text to search.
    :return: list of ``[matched_text, [start, end]]`` in order of appearance.
    """
    pattern = "(?P<value>" + bidway_special + ")"
    return [[hit.group(), list(hit.span())]
            for hit in re.finditer(pattern, _str)]
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def get_one_word(bidway_list):
    """Pick a single keyword out of several ``[keyword, [start, end]]`` hits.

    Repeated keywords are reduced to their earliest occurrence. The keyword
    that starts first wins; if two start at the same position, the longer
    one wins.

    :param bidway_list: list of ``[keyword, [start, end]]`` entries.
    :return: ``(keyword, [start, end])``, or ``(None, [0, 0])`` when the
        list is empty.
    """
    if not bidway_list:
        return None, [0, 0]
    if len(bidway_list) == 1:
        return bidway_list[0][0], bidway_list[0][1]

    # Keep only the earliest span of each distinct keyword.
    earliest = {}
    for bw in bidway_list:
        keyword, span = bw[0], bw[1]
        if keyword not in earliest or span[0] < earliest[keyword][0]:
            earliest[keyword] = span

    # Smallest start first, then longest keyword. ">=" keeps the last of
    # several fully tied candidates.
    word = None
    best_rank = None
    for keyword, span in earliest.items():
        rank = (-int(span[0]), len(keyword))
        if best_rank is None or rank >= best_rank:
            word, best_rank = keyword, rank
    return word, earliest[word]
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def re_bidway(text, title):
    """Extract the bidding method from an announcement.

    Sources are tried in this order and the first hit is returned:

    1. ``<label><separator><method>`` in the body (first match);
    2. any method keyword in the title;
    3. any method keyword in the body;
    4. the special fallback patterns in the body (first match).

    Look-alike phrases are masked with ``re_not_bidway`` before searching.

    :param text: announcement body; ``None`` is treated as empty.
    :param title: announcement title; ``None`` or empty skips step 2.
    :return: ``(keyword, [start, end])`` or ``(None, [0, 0])`` when nothing
        is found.
    """
    # NOTE(review): a hit from step 2 carries offsets into ``title``, not
    # ``text`` - confirm that callers expect this.
    text_clean = re_not_bidway(text or "")

    # 1. Labelled form in the body.
    bidway_list = re_standard_bidway(text_clean)
    if bidway_list:
        return bidway_list[0][0], bidway_list[0][1]

    # 2. Any keyword in the title. The title is only cleaned when it is
    #    actually consulted.
    if title:
        bidway_list = re_all_bidway(re_not_bidway(title))
        if bidway_list:
            return get_one_word(bidway_list)

    # 3. Any keyword in the body.
    bidway_list = re_all_bidway(text_clean)
    if bidway_list:
        return get_one_word(bidway_list)

    # 4. Special fallback patterns in the body.
    bidway_list = re_special_bidway(text_clean)
    if bidway_list:
        return bidway_list[0][0], bidway_list[0][1]

    # Nothing found.
    return None, [0, 0]
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def extract_bidway(text, title):
|
|
list_bidway = []
|
|
list_bidway = []
|
|
- word, text_index_list = re_bidway(text)
|
|
|
|
|
|
+ word, text_index_list = re_bidway(text, title)
|
|
if word is not None:
|
|
if word is not None:
|
|
if text_index_list[1]-text_index_list[0] != len(word) \
|
|
if text_index_list[1]-text_index_list[0] != len(word) \
|
|
- or text_index_list[1]-text_index_list[0] >= 30:
|
|
|
|
|
|
+ or text_index_list[1]-text_index_list[0] >= 10:
|
|
return []
|
|
return []
|
|
d = {"body": word, "begin_index": text_index_list[0], "end_index": text_index_list[1]}
|
|
d = {"body": word, "begin_index": text_index_list[0], "end_index": text_index_list[1]}
|
|
list_bidway.append(d)
|
|
list_bidway.append(d)
|
|
@@ -224,8 +386,62 @@ def extract_bidway(text):
|
|
return list_bidway
|
|
return list_bidway
|
|
|
|
|
|
|
|
|
|
-if __name__ == "__main__":
|
|
|
|
- # df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
|
|
|
|
- # s = df["text"].iloc[1]
|
|
|
|
|
|
def test_csv():
    """Run ``re_bidway`` over the ``text`` column of a local CSV.

    Each row's prediction is printed, then the stringified predictions are
    appended as an extra column and the table is written to a result CSV.
    """
    source = pd.read_csv("C:\\Users\\Administrator\\Desktop\\bidway_text.csv")

    rendered = []
    for _, record in source.iterrows():
        word, text_index = re_bidway(record["text"], "")
        # An empty list marks "no bidding method found".
        predict = [word, text_index] if word else []
        print("predict", predict)
        rendered.append(str(predict))

    merged = pd.concat([source, pd.DataFrame(rendered)], axis=1)
    merged.to_csv("C:\\Users\\Administrator\\Desktop\\bidway_result.csv")
    print("finish write!")
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def test_str():
|
|
s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
|
|
s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
|
|
- extract_bidway(s)
|
|
|
|
|
|
+ s = '''
|
|
|
|
+ ,关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知,各投标人:深圳市国际招标有限公司受中共
|
|
|
|
+ 深圳市委军民融合发展委员会办公室委托,就人防工程技术咨询服务项目【重新招标】(项目编号:0658-2171
|
|
|
|
+ 1A60965),进行公开招标,因投标单位不足三家,公开招标失败,现经采购单位同意,采用单一来源谈判方式
|
|
|
|
+ 确定中标供应商,邀请中国建筑标准设计研究院有限公司前来谈判,一、项目编号:0658-21711A60965,二
|
|
|
|
+ 、项目名称:人防工程技术咨询服务项目【重新招标】,三、凡被邀请参加谈判的供应商必须按照原招标文件第
|
|
|
|
+ 六章要求制作谈判文件正本一本,副本二本,按规定的时间密封递交并参加谈判,四、谈判内容:投标价格、项
|
|
|
|
+ 目实施方案、售后服务方案和其它相关事项,五、地点及时间:1、因疫情影响本项目谈判响应文件采用邮寄方
|
|
|
|
+ 式接收文件,2、文件接收截止时间:2021年11月5日14:30(北京时间),3、谈判响应文件邮寄地址:深圳
|
|
|
|
+ 市罗湖区嘉宾路2018号深华商业大厦裙楼6层600A。收件人:郑工,电话:18806665013,3、谈判地点:
|
|
|
|
+ 线上谈判,六、谈判的相关规则按原招标文件的相应规定执行;有关谈判事宜详见招标文件第六章《公开招标失
|
|
|
|
+ 败后后续采购程序和投标须知》,1、采购人信息,名称:中共深圳市委军民融合发展委员会办公室,地址:深
|
|
|
|
+ 圳市福田区新洲路5008号,联系方式:刘先生,电话:0755-88100332,2、采购代理机构信息,名称:深
|
|
|
|
+ 圳市国际招标有限公司,地址:罗湖总部:深圳市罗湖区嘉宾路2018号深华商业大厦裙楼6层,深圳湾总部:深
|
|
|
|
+ 圳市南山区沙河西路与白石路交汇处深圳湾科技生态园9栋B4座6楼,联系方式:0755-22918634,监督举报
|
|
|
|
+ 电话:0755-22965602、0755-86660475,特此通知,深圳市国际招标有限公司,2021年11月1日,更多
|
|
|
|
+ 咨询报价请点击:http://zbcloud.net/bidbulletin/69495.htm,
|
|
|
|
+ '''
|
|
|
|
+ print(extract_bidway(s, title=""))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def test_html():
    """Print the ``extract_bidway`` result for a local HTML file."""
    with open("C:/Users/Administrator/Desktop/3.html", "r") as handle:
        page = handle.read()

    print(extract_bidway(page, title=""))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
if __name__ == "__main__":
    # Manual entry point: the CSV batch check runs by default.
    # The other smoke tests, test_str() and test_html(), can be called instead.
    test_csv()
|
|
|
|
+
|