fangjiasheng 3 лет назад
Родитель
Сommit
e17c8d50d7

+ 430 - 214
BiddingKG/dl/bidway/re_bidway.py

@@ -1,222 +1,384 @@
 import ast
-
 import pandas as pd
 import re
 
-from BiddingKG.dl.interface import Entitys
-
-def re_bidway(text):
-    # df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
-
-    reg = re.compile(u'(采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式'
-                     u'|发包方式|发包类型|开展方式|招标类型)(.*)'
-                     u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
-                     u'|电子书面竞投|邀请招标|定向公开|询价采购|抽签摇号'
-                     u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
-                     u'|网上招标|其他'
-                     u'|竞谈竞价|网上直购|公开竞谈'
-                     u'|库内邀请|库内公开发包)')
-
-    # reg = re.compile(u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源|网络竞价'
-    #                  u'|竞争性谈判|公开询价|邀请招标|公开招募|公开询比价|电子书面竞投'
-    #                  u'|网上电子投标|比质比价|定向询单|国内比选|电子竞价'
-    #                  u'|公开招租|公开竞标方式|网上招标|公开招标|国内竞争性谈判'
-    #                  u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
-    #                  u'|库内邀请|询价采购|询比采购|分散采购|单一来源采购)')
-
-    reg2 = re.compile(u'(采用|以|)'
-                      u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
-                      u'|竞争性谈判|询价|电子书面竞投|电子竞价'
-                      u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
-                      u'|网上招标|分散采购'
-                      u'|竞谈竞价|网上直购|公开竞谈'
-                      u'|库内邀请)'
-                      u'(采购方式|方式)')
-
-    reg1 = re.compile(
-        # u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源采购|网络竞价|公开招商方式'
-        # u'|竞争性谈判|公开询价|询价采购|邀请招标|公开招募|公开询比|电子书面竞投'
-        # u'|网上电子投标|比质比价|定向询单|询比采购|国内比选|单一来源|公开选取|库内公开发包'
-        # u'|公开招租|公开竞标方式|网上招标|公开招标|竞争性谈判|公开招投标'
-        # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
-        # u'|国际公开竞争性招标)'
-        u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
-        u'|竞争性谈判|询价|电子书面竞投'
-        u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
-        u'|网上招标|分散采购'
-        u'|竞谈竞价|网上直购|公开竞谈'
-        u'|库内邀请)'
-    )
-
-    # 都切为4个字符
-    # reg1_not = re.compile(u'(及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录)')
-    reg1_not = re.compile(u'(及单一来|价小组成|除单一来|性谈判邀|询价记录)')
-
-    reg3 = re.compile(u'(采购方式:邀请|采购方式:公开|采购方式:询价|分散采购|公开招标|竞价|磋商|询比|竞标|邀请招标|公开招募|公开招租)')
-
-
-    reg_standard = re.compile(u'(公开招标|竞争性磋商|竞争性谈判|单一来源'
-                              u'|竞争性谈判|询价|邀请招标|公开招募|询比|电子书面竞投'
-                              u'|网上电子投标|比质比价|询单|比选'
-                              u'|公开招租|网上招标|分散采购'
-                              u'|网上直购|公开竞谈|采购方式:邀请|采购方式:公开|采购方式:询价)'
-                              )
-
-    # text_list = df["text"].to_list()
-    text_list = []
-    text_list.append(text)
-    text_index_list = []
-    output_list = []
-    for index in range(len(text_list)):
-        # 全文下标
-        text_index = [0, 0]
-
-        input_str = text_list[index]
-
-        # 把一些混淆的词先替换掉
-        input_str = re.sub(reg1_not, "####", input_str)
-
-        match = reg.search(input_str)
-        output_str = None
-        # 根据正则表达式匹配
-        if match:
-            # 更新全文下标
-            text_index[0] = match.start()
-            text_index[1] = match.end()
-
-            # 判断长度,截断
-            if len(match.group()) >= 15:
-                ss = re.split(",|\.|,|。|;|;", match.group())
-                # 判断所需的字符串在哪一段
-                for i in range(len(ss)):
-                    if re.search(reg1, ss[i]):
-                        output_str = ss[i]
-
-                        # 更新全文下标
-                        front_len, back_len = calculateLen(ss, i)
-                        text_index[0] = text_index[0] + front_len + i
-                        text_index[1] = text_index[1] - back_len + len(ss) -1 - i
-
-                        break
+# from BiddingKG.dl.interface import Entitys
+
+
+# def re_bidway_old(text):
+#     df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
+#
+#     reg = re.compile(u'(采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式'
+#                      u'|发包方式|发包类型|开展方式|招标类型)(.*)'
+#                      u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
+#                      u'|电子书面竞投|邀请招标|定向公开|询价采购|抽签摇号'
+#                      u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
+#                      u'|网上招标|其他'
+#                      u'|竞谈竞价|网上直购|公开竞谈'
+#                      u'|库内邀请|库内公开发包)')
+#
+#     # reg = re.compile(u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源|网络竞价'
+#     #                  u'|竞争性谈判|公开询价|邀请招标|公开招募|公开询比价|电子书面竞投'
+#     #                  u'|网上电子投标|比质比价|定向询单|国内比选|电子竞价'
+#     #                  u'|公开招租|公开竞标方式|网上招标|公开招标|国内竞争性谈判'
+#     #                  u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
+#     #                  u'|库内邀请|询价采购|询比采购|分散采购|单一来源采购)')
+#
+#     reg2 = re.compile(u'(采用|以|)'
+#                       u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
+#                       u'|竞争性谈判|询价|电子书面竞投|电子竞价'
+#                       u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
+#                       u'|网上招标|分散采购'
+#                       u'|竞谈竞价|网上直购|公开竞谈'
+#                       u'|库内邀请)'
+#                       u'(采购方式|方式)')
+#
+#     reg1 = re.compile(
+#         # u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源采购|网络竞价|公开招商方式'
+#         # u'|竞争性谈判|公开询价|询价采购|邀请招标|公开招募|公开询比|电子书面竞投'
+#         # u'|网上电子投标|比质比价|定向询单|询比采购|国内比选|单一来源|公开选取|库内公开发包'
+#         # u'|公开招租|公开竞标方式|网上招标|公开招标|竞争性谈判|公开招投标'
+#         # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
+#         # u'|国际公开竞争性招标)'
+#         u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
+#         u'|竞争性谈判|询价|电子书面竞投'
+#         u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
+#         u'|网上招标|分散采购'
+#         u'|竞谈竞价|网上直购|公开竞谈'
+#         u'|库内邀请)'
+#     )
+#
+#     # 都切为4个字符
+#     # reg1_not = re.compile(u'(及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录)')
+#     reg1_not = re.compile(u'(及单一来|价小组成|除单一来|性谈判邀|询价记录)')
+#
+#     reg3 = re.compile(u'(采购方式:邀请|采购方式:公开|采购方式:询价|分散采购|公开招标|竞价|磋商|询比|竞标|邀请招标|公开招募|公开招租)')
+#
+#
+#     reg_standard = re.compile(u'(公开招标|竞争性磋商|竞争性谈判|单一来源'
+#                               u'|竞争性谈判|询价|邀请招标|公开招募|询比|电子书面竞投'
+#                               u'|网上电子投标|比质比价|询单|比选'
+#                               u'|公开招租|网上招标|分散采购'
+#                               u'|网上直购|公开竞谈|采购方式:邀请|采购方式:公开|采购方式:询价)'
+#                               )
+#
+#     text_list = df["text"].to_list()
+#     # text_list = []
+#     # text_list.append(text)
+#     text_index_list = []
+#     output_list = []
+#     for index in range(len(text_list)):
+#         # 全文下标
+#         text_index = [0, 0]
+#
+#         input_str = text_list[index]
+#
+#         # 把一些混淆的词先替换掉
+#         input_str = re.sub(reg1_not, "####", input_str)
+#
+#         match = reg.search(input_str)
+#         output_str = None
+#         # 根据正则表达式匹配
+#         if match:
+#             # 更新全文下标
+#             text_index[0] = match.start()
+#             text_index[1] = match.end()
+#
+#             # 判断长度,截断
+#             if len(match.group()) >= 15:
+#                 ss = re.split(",|\.|,|。|;|;", match.group())
+#                 # 判断所需的字符串在哪一段
+#                 for i in range(len(ss)):
+#                     if re.search(reg1, ss[i]):
+#                         output_str = ss[i]
+#
+#                         # 更新全文下标
+#                         front_len, back_len = calculateLen(ss, i)
+#                         text_index[0] = text_index[0] + front_len + i
+#                         text_index[1] = text_index[1] - back_len + len(ss) -1 - i
+#
+#                         break
+#             else:
+#                 output_str = match.group()
+#
+#         else:
+#             match2 = re.search(reg2, input_str)
+#             if match2:
+#                 # 更新全文下标
+#                 text_index[0] = match2.start()
+#                 text_index[1] = match2.end()
+#
+#                 output_str = match2.group()
+#
+#             else:
+#                 match1 = re.search(reg1, input_str)
+#                 if match1:
+#                     # 更新全文下标
+#                     text_index[0] = match1.start()
+#                     text_index[1] = match1.end()
+#                     output_str = match1.group()
+#
+#         # 再判断一次长度
+#         if output_str is not None:
+#             if len(output_str) >= 15:
+#                 match2 = re.search(reg2, input_str)
+#                 if match2:
+#                     # 更新全文下标
+#                     text_index[0] = match2.start()
+#                     text_index[1] = match2.end()
+#
+#                     output_str = match2.group()
+#             if len(output_str) >= 15:
+#                 match1 = re.search(reg1, input_str)
+#                 if match1:
+#                     # 更新全文下标
+#                     text_index[0] = match1.start()
+#                     text_index[1] = match1.end()
+#
+#                     output_str = match1.group()
+#
+#         # 最后输出还为空,匹配一些易混淆的词
+#         if output_str is None:
+#             match3 = re.search(reg3, input_str)
+#             if match3:
+#                 # 更新全文下标
+#                 text_index[0] = match3.start()
+#                 text_index[1] = match3.end()
+#
+#                 output_str = match3.group()
+#
+#         # 处理前缀等无用词
+#         if output_str is not None:
+#             match5 = re.search("分散采购|采购方式:邀请", output_str)
+#             if not match5:
+#                 # 公开采购转为公开招标
+#                 output_str = re.sub("公开采购", "公开招标", output_str)
+#
+#                 # 去掉第一个字符冒号
+#                 ss = re.split(":|:", output_str)
+#                 output_str = ss[-1]
+#                 # 更新全文下标
+#                 front_len, back_len = calculateLen(ss, len(ss) - 1)
+#                 text_index[0] = text_index[0] + front_len + len(ss) - 1
+#
+#                 # 去掉采购、方式、采用
+#                 match6 = re.search("(采用|出售|直接(|现就本次|招标为)", output_str)
+#                 match7 = re.search("(采购|方式|进行)", output_str)
+#                 output_str = re.sub("(采购|方式|采用|出售|进行|直接(|现就本次|招标为)", "", output_str)
+#                 # 更新全文下标
+#                 if match6:
+#                     text_index[0] += match6.end() - match6.start()
+#                 if match7:
+#                     text_index[1] -= match7.end() - match7.start()
+#
+#             # 使用标准标签过滤
+#             match4 = re.search(reg_standard, output_str)
+#             if match4:
+#                 output_str = match4.group()
+#                 # 更新全文下标
+#                 text_index[0] += match4.start()
+#                 text_index[1] = text_index[0] + match4.end() - match4.start()
+#
+#         output_list.append(output_str)
+#         # text_index_list.append(str(text_index))
+#         text_index_list.append(text_index)
+#
+#     # df["re"] = pd.DataFrame(output_list)
+#     # df["text_index"] = pd.DataFrame(text_index_list)
+#
+#     # index_to_word = []
+#     # for index, row in df.iterrows():
+#     #     i_list = ast.literal_eval(row["text_index"])
+#     #     word = row["text"][i_list[0]:i_list[1]]
+#     #     if len(word) >= 20:
+#     #         word = ""
+#     #     index_to_word.append(word)
+#
+#
+#     # df["index2word"] = pd.DataFrame(index_to_word)
+#     # df.to_csv("C:\\Users\\admin\\Desktop\\bidway_text2.csv")
+#
+#     return output_list[0], text_index_list[0]
+
+
+# Regex alternation of the bidding-method keywords this module can return.
+# It is spliced verbatim into the patterns built by re_not_bidway,
+# re_standard_bidway and re_all_bidway. Alternatives are tried left to right,
+# so keep longer variants ahead of their prefixes (e.g. 询比价 before 询比).
+bidway = '单一来源' \
+         '|国内竞争性磋商|竞争性磋商|竞争性谈判|网络竞价|网上竞价|公开竞谈|公开竞价|电子竞价|竞价|竞标|竞谈竞价|电子书面竞投' \
+         '|公开比选|比质比价|比选' \
+         '|公开招标|公开招租|公开招募|公开选取|公开招投标' \
+         '|网上直购|网上招标|网上电子投标|网上挂牌' \
+         '|邀请招标' \
+         '|网上询价|公开询价|非定向询价|定向询价|询比价|询单|询价|询比' \
+         '|库内邀请|库内公开发包|内部邀标' \
+         '|定点采购议价|定点采购' \
+         '|竞争性评审'
+
+# Phrases that contain a keyword-like fragment but are not bidding methods;
+# re_not_bidway overwrites every occurrence with '#' characters.
+not_bidway = '及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录|自由竞价' \
+             '|限时竞价|咨询单位|询价单'
+
+# Words that, when they directly precede a keyword, cause re_not_bidway to
+# mask the whole "prefix + keyword" span.
+not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除"
+
+# Words that, when they directly follow a keyword, cause re_not_bidway to
+# mask the whole "keyword + suffix" span. A few entries are repeated; the
+# repeats do not change what the alternation matches.
+not_bidway_suffix = "文件|报名|邀请|项目|失败|数量|编号|后|时间|类型|名称|和|成交" \
+                    "|标题|开始|结束|产品|报价|供应商|部门|监督|需求|范围|入围|内容|人" \
+                    "|条件|公司|保证金|完毕|事件|成功|活动|地点|标|会|须知|范围" \
+                    "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败"
+
+# Field labels that introduce a bidding method. re_standard_bidway requires
+# one of these, then one or two arbitrary characters, then a keyword.
+bidway_preffix = '采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式' \
+                 '|发包方式|发包类型|开展方式|招标类型|选取方式|招租方式'
+
+# Patterns matched by re_special_bidway, which re_bidway only tries when all
+# other stages found nothing. The text is used as a regex, so each '.' below
+# matches any single character. The trailing '' is an empty literal and adds
+# nothing to the pattern.
+bidway_special = '采购方式:公开|采购方式:邀请|采购方式:询价' \
+                 '|招标方式:.公开|采购方式:.公开' \
+                 '|分散采购' \
+                 ''
+
+
+def re_not_bidway(_str):
+    """Mask confusable phrases in ``_str`` and return the masked string.
+
+    Three patterns are applied one after another, each on the result of the
+    previous one:
+
+    1. the fixed phrases in ``not_bidway``;
+    2. a ``bidway`` keyword directly followed by a ``not_bidway_suffix`` word;
+    3. a ``not_bidway_preffix`` word directly followed by a ``bidway`` keyword.
+
+    Every hit is replaced by a run of '#' of the same length as the hit.
+    """
+    confusable_patterns = (
+        not_bidway,
+        "(" + bidway + ")" + "(" + not_bidway_suffix + ")",
+        "(" + not_bidway_preffix + ")" + "(" + bidway + ")",
+    )
+    for pattern in confusable_patterns:
+        # findall is evaluated once per pattern, before any masking for that
+        # pattern is applied.
+        for hit in re.findall(pattern, _str):
+            # A hit is a plain string for the group-less pattern and a tuple
+            # of group texts otherwise; joining yields the full phrase in
+            # both cases.
+            phrase = "".join(hit)
+            _str = re.sub(phrase, "#" * len(phrase), _str)
+    return _str
+
+
+def re_standard_bidway(_str):
+    """Find keywords written in the standard "label + separator + keyword" form.
+
+    A hit is a ``bidway_preffix`` label, then one or two arbitrary characters,
+    then a ``bidway`` keyword.
+
+    Returns a list with one ``[keyword, [start, end]]`` entry per hit, where
+    the offsets delimit the keyword itself inside ``_str``.
+    """
+    labelled_pattern = "(?P<preffix>" + bidway_preffix + ")" \
+                       + "(?P<char>.{1,2})" \
+                       + "(?P<value>" + bidway + ")"
+    # The three groups are adjacent and cover the whole match, so the span of
+    # the "value" group is the match span with the label and separator
+    # lengths skipped at the front.
+    return [[hit.group("value"), list(hit.span("value"))]
+            for hit in re.finditer(labelled_pattern, _str)]
+
+
+def re_all_bidway(_str):
+    """Collect every ``bidway`` keyword occurring anywhere in ``_str``.
+
+    Returns a list of ``[keyword, [start, end]]`` entries in match order.
+    """
+    keyword_pattern = "(?P<value>" + bidway + ")"
+    return [[hit.group(), list(hit.span())]
+            for hit in re.finditer(keyword_pattern, _str)]
+
+
+def re_special_bidway(_str):
+    """Collect every ``bidway_special`` pattern hit in ``_str``.
+
+    Returns a list of ``[matched_text, [start, end]]`` entries in match order.
+    """
+    special_pattern = "(?P<value>" + bidway_special + ")"
+    return [[hit.group(), list(hit.span())]
+            for hit in re.finditer(special_pattern, _str)]
+
+
+def get_one_word(bidway_list):
+    """Reduce a list of keyword hits to a single (word, text_index) pair.
+
+    ``bidway_list`` holds ``[keyword, [start, end]]`` entries. The result is
+    ``(None, [0, 0])`` for an empty list and the sole entry for a one-element
+    list. With several entries, hits are de-duplicated by keyword (keeping
+    the span with the smallest start) and the keyword with the smallest start
+    offset is returned; keyword length only breaks ties between equal starts.
+    """
+    # Several candidates: de-duplicate, then pick one (see docstring).
+    word = None
+    text_index = [0, 0]
+    if len(bidway_list) > 1:
+        # keyword -> span of its earliest occurrence
+        word_dict = {}
+        for bw in bidway_list:
+            if bw[0] in word_dict.keys():
+                if bw[1][0] < word_dict.get(bw[0])[0]:
+                    word_dict[bw[0]] = bw[1]
             else:
-                output_str = match.group()
-
+                word_dict[bw[0]] = bw[1]
+
+        # [keyword, start offset] pairs used only for ranking
+        word_list = []
+        for key in word_dict.keys():
+            word_list.append([key, word_dict.get(key)[0]])
+
+        if len(word_list) > 1:
+            # Ascending sort on (-start, length): the last element has the
+            # smallest start and, among equal starts, the longest keyword.
+            word_list.sort(key=lambda x: (-int(x[1]), len(x[0])))
+            word = word_list[-1][0]
+            text_index = word_dict.get(word)
+        elif word_list:
+            word = word_list[0][0]
+            text_index = word_dict.get(word)
         else:
-            match2 = re.search(reg2, input_str)
-            if match2:
-                # 更新全文下标
-                text_index[0] = match2.start()
-                text_index[1] = match2.end()
-
-                output_str = match2.group()
-
-            else:
-                match1 = re.search(reg1, input_str)
-                if match1:
-                    # 更新全文下标
-                    text_index[0] = match1.start()
-                    text_index[1] = match1.end()
-                    output_str = match1.group()
-
-        # 再判断一次长度
-        if output_str is not None:
-            if len(output_str) >= 15:
-                match2 = re.search(reg2, input_str)
-                if match2:
-                    # 更新全文下标
-                    text_index[0] = match2.start()
-                    text_index[1] = match2.end()
-
-                    output_str = match2.group()
-            if len(output_str) >= 15:
-                match1 = re.search(reg1, input_str)
-                if match1:
-                    # 更新全文下标
-                    text_index[0] = match1.start()
-                    text_index[1] = match1.end()
-
-                    output_str = match1.group()
-
-        # 最后输出还为空,匹配一些易混淆的词
-        if output_str is None:
-            match3 = re.search(reg3, input_str)
-            if match3:
-                # 更新全文下标
-                text_index[0] = match3.start()
-                text_index[1] = match3.end()
-
-                output_str = match3.group()
-
-        # 处理前缀等无用词
-        if output_str is not None:
-            match5 = re.search("分散采购|采购方式:邀请", output_str)
-            if not match5:
-                # 公开采购转为公开招标
-                output_str = re.sub("公开采购", "公开招标", output_str)
-
-                # 去掉第一个字符冒号
-                ss = re.split(":|:", output_str)
-                output_str = ss[-1]
-                # 更新全文下标
-                front_len, back_len = calculateLen(ss, len(ss) - 1)
-                text_index[0] = text_index[0] + front_len + len(ss) - 1
-
-                # 去掉采购、方式、采用
-                match6 = re.search("(采用|出售|直接(|现就本次|招标为)", output_str)
-                match7 = re.search("(采购|方式|进行)", output_str)
-                output_str = re.sub("(采购|方式|采用|出售|进行|直接(|现就本次|招标为)", "", output_str)
-                # 更新全文下标
-                if match6:
-                    text_index[0] += match6.end() - match6.start()
-                if match7:
-                    text_index[1] -= match7.end() - match7.start()
-
-            # 使用标准标签过滤
-            match4 = re.search(reg_standard, output_str)
-            if match4:
-                output_str = match4.group()
-                # 更新全文下标
-                text_index[0] += match4.start()
-                text_index[1] = text_index[0] + match4.end() - match4.start()
-
-        output_list.append(output_str)
-        # text_index_list.append(str(text_index))
-        text_index_list.append(text_index)
-
-    # df["re"] = pd.DataFrame(output_list)
-    # df["text_index"] = pd.DataFrame(text_index_list)
-
-    # index_to_word = []
-    # for index, row in df.iterrows():
-    #     i_list = ast.literal_eval(row["text_index"])
-    #     word = row["text"][i_list[0]:i_list[1]]
-    #     if len(word) >= 20:
-    #         word = ""
-    #     index_to_word.append(word)
-
-
-    # df["index2word"] = pd.DataFrame(index_to_word)
-    # df.to_csv("C:\\Users\\admin\\Desktop\\bidway_text2.csv")
-
-    return output_list[0], text_index_list[0]
-
-
-def calculateLen(ss, i):
-    front_len = 0
-    back_len = 0
-    for index in range(i):
-        front_len += len(ss[index])
-    for index in range(i+1, len(ss)):
-        back_len += len(ss[index])
-    return front_len, back_len
-
-
-def extract_bidway(text):
+            # Fallback for an empty word_list; keeps the default index.
+            text_index = [0, 0]
+    elif len(bidway_list) == 1:
+        word = bidway_list[0][0]
+        text_index = bidway_list[0][1]
+    return word, text_index
+
+
+def re_bidway(text, title):
+    """Extract one bidding-method keyword from an article.
+
+    Stages are tried in order and the first one that yields a hit wins:
+
+    1. standard "label + separator + keyword" form in the text (first hit);
+    2. any keyword in the title (reduced with ``get_one_word``);
+    3. any keyword in the text (reduced with ``get_one_word``);
+    4. special patterns in the text (first hit).
+
+    Returns ``(word, [start, end])``, or ``(None, [0, 0])`` when nothing is
+    found.
+    """
+    # Mask confusable phrases before any matching.
+    text_clean = re_not_bidway(text)
+    title_clean = re_not_bidway(title)
+
+    # Stage 1: standard labelled form.
+    standard_hits = re_standard_bidway(text_clean)
+    if standard_hits:
+        word, text_index = standard_hits[0]
+        return word, text_index
+
+    # Stages 2 and 3: any keyword, title first, then the body text.
+    # NOTE(review): offsets from the title stage index into the title, not
+    # the text - confirm that callers can cope with that.
+    for cleaned in (title_clean, text_clean):
+        keyword_hits = re_all_bidway(cleaned)
+        if keyword_hits:
+            word, text_index = get_one_word(keyword_hits)
+            return word, text_index
+
+    # Stage 4: special patterns as a last resort.
+    special_hits = re_special_bidway(text_clean)
+    if special_hits:
+        word, text_index = special_hits[0]
+        return word, text_index
+
+    # Nothing matched.
+    return None, [0, 0]
+
+
+def extract_bidway(text, title):
     list_bidway = []
-    word, text_index_list = re_bidway(text)
+    word, text_index_list = re_bidway(text, title)
     if word is not None:
         if text_index_list[1]-text_index_list[0] != len(word) \
-                or text_index_list[1]-text_index_list[0] >= 30:
+                or text_index_list[1]-text_index_list[0] >= 10:
             return []
         d = {"body": word, "begin_index": text_index_list[0], "end_index": text_index_list[1]}
         list_bidway.append(d)
@@ -224,8 +386,62 @@ def extract_bidway(text):
     return list_bidway
 
 
-if __name__ == "__main__":
-    # df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
-    # s = df["text"].iloc[1]
+def test_csv():
+    """Run ``re_bidway`` on each row of a local CSV and save the predictions.
+
+    Reads the "text" column of a CSV at a hard-coded desktop path, prints
+    each prediction, appends the stringified predictions as an extra column
+    and writes the result to a second hard-coded path.
+    """
+    source_df = pd.read_csv("C:\\Users\\Administrator\\Desktop\\bidway_text.csv")
+
+    predictions = []
+    for _, record in source_df.iterrows():
+        word, text_index = re_bidway(record["text"], "")
+        # An empty list marks rows for which no keyword was found.
+        predict = [word, text_index] if word else []
+        print("predict", predict)
+        predictions.append(str(predict))
+
+    result_df = pd.concat([source_df, pd.DataFrame(predictions)], axis=1)
+
+    result_df.to_csv("C:\\Users\\Administrator\\Desktop\\bidway_result.csv")
+    print("finish write!")
+
+
+def test_str():
+    """Print the result of ``extract_bidway`` for a hard-coded sample text."""
+    # NOTE: the first sample below is immediately overwritten by the second
+    # assignment, so only the multi-line notice is actually tested.
     s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
-    extract_bidway(s)
+    s = '''
+    ,关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知,各投标人:深圳市国际招标有限公司受中共
+    深圳市委军民融合发展委员会办公室委托,就人防工程技术咨询服务项目【重新招标】(项目编号:0658-2171
+    1A60965),进行公开招标,因投标单位不足三家,公开招标失败,现经采购单位同意,采用单一来源谈判方式
+    确定中标供应商,邀请中国建筑标准设计研究院有限公司前来谈判,一、项目编号:0658-21711A60965,二
+    、项目名称:人防工程技术咨询服务项目【重新招标】,三、凡被邀请参加谈判的供应商必须按照原招标文件第
+    六章要求制作谈判文件正本一本,副本二本,按规定的时间密封递交并参加谈判,四、谈判内容:投标价格、项
+    目实施方案、售后服务方案和其它相关事项,五、地点及时间:1、因疫情影响本项目谈判响应文件采用邮寄方
+    式接收文件,2、文件接收截止时间:2021年11月5日14:30(北京时间),3、谈判响应文件邮寄地址:深圳
+    市罗湖区嘉宾路2018号深华商业大厦裙楼6层600A。收件人:郑工,电话:18806665013,3、谈判地点:
+    线上谈判,六、谈判的相关规则按原招标文件的相应规定执行;有关谈判事宜详见招标文件第六章《公开招标失
+    败后后续采购程序和投标须知》,1、采购人信息,名称:中共深圳市委军民融合发展委员会办公室,地址:深
+    圳市福田区新洲路5008号,联系方式:刘先生,电话:0755-88100332,2、采购代理机构信息,名称:深
+    圳市国际招标有限公司,地址:罗湖总部:深圳市罗湖区嘉宾路2018号深华商业大厦裙楼6层,深圳湾总部:深
+    圳市南山区沙河西路与白石路交汇处深圳湾科技生态园9栋B4座6楼,联系方式:0755-22918634,监督举报
+    电话:0755-22965602、0755-86660475,特此通知,深圳市国际招标有限公司,2021年11月1日,更多
+    咨询报价请点击:http://zbcloud.net/bidbulletin/69495.htm,
+    '''
+    print(extract_bidway(s, title=""))
+
+
+def test_html():
+    """Print the result of ``extract_bidway`` for a local HTML file.
+
+    The file is read from a hard-coded desktop path using the platform's
+    default text encoding, and an empty title is passed.
+    """
+    with open("C:/Users/Administrator/Desktop/3.html", "r") as html_file:
+        page_source = html_file.read()
+
+    print(extract_bidway(page_source, title=""))
+
+
+# Manual entry point: runs the CSV batch check by default; the single-string
+# and HTML checks are kept commented out for ad-hoc use.
+if __name__ == "__main__":
+    # extract_bidway(s)
+
+    test_csv()
+    # test_str()
+    # test_html()
+

+ 2 - 1
BiddingKG/dl/interface/Entitys.py

@@ -29,7 +29,7 @@ class Article():
     @summary:文章类
     '''
     
-    def __init__(self,id,content,sourceContent,doc_id,title,code="",name=""):
+    def __init__(self,id,content,sourceContent,doc_id,title,code="",name="",bidway=""):
         '''
         @param:
             id:文章的uuid
@@ -46,6 +46,7 @@ class Article():
         self.match_enterprise = []
         self.match_enterprise_type = 0
         self.attachmentTypes = self.getAttachmentTypes(sourceContent)
+        self.bidway = bidway
 
     def toJson(self):
         _dict = {"id":self.id,"content":self.content,"code":self.code,

+ 25 - 16
BiddingKG/dl/interface/Preprocessing.py

@@ -1473,6 +1473,14 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         key_preprocess = "tableToText"
         start_time = time.time()
         article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
+
+        # 提取bidway
+        list_bidway = extract_bidway(article_processed, _title)
+        if list_bidway:
+            bidway = list_bidway[0].get("body")
+        else:
+            bidway = ""
+
         # 修正被","逗号分隔的时间
         repair_time = re.compile("[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?[日号]?,?(?:上午|下午)?,?[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d|"
                                  "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?[日号]?,?(?:上午|下午)?,?[0-2]?\d,?[:时点],?[0-6]\d分?|"
@@ -1500,7 +1508,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         cost_time[key_preprocess] += time.time()-start_time
 
         #article_processed = article[1]
-        _article = Article(doc_id,article_processed,sourceContent,_send_doc_id,_title)
+        _article = Article(doc_id,article_processed,sourceContent,_send_doc_id,_title,
+                           bidway=bidway)
         _article.fingerprint = getFingerprint(_title+sourceContent)
         list_articles.append(_article)
     return list_articles
@@ -1943,21 +1952,21 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                            begin_index_temp, end_index_temp))
 
             # 招标方式提取 2020/12/30 新增
-            list_bidway = extract_bidway(sentence_text)
-            entity_type = "bidway"
-            for bidway in list_bidway:
-                begin_index_temp = bidway['begin_index']
-                end_index_temp = bidway['end_index']
-                begin_index = changeIndexFromWordToWords(tokens, begin_index_temp)
-                end_index = changeIndexFromWordToWords(tokens, end_index_temp)
-                if begin_index is None or end_index is None:
-                    continue
-                print(begin_index_temp,end_index_temp,begin_index,end_index)
-                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
-                entity_text = bidway['body']
-                list_sentence_entitys.append(
-                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+            # list_bidway = extract_bidway(sentence_text, )
+            # entity_type = "bidway"
+            # for bidway in list_bidway:
+            #     begin_index_temp = bidway['begin_index']
+            #     end_index_temp = bidway['end_index']
+            #     begin_index = changeIndexFromWordToWords(tokens, begin_index_temp)
+            #     end_index = changeIndexFromWordToWords(tokens, end_index_temp)
+            #     if begin_index is None or end_index is None:
+            #         continue
+            #     print(begin_index_temp,end_index_temp,begin_index,end_index)
+            #     entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
+            #     entity_text = bidway['body']
+            #     list_sentence_entitys.append(
+            #         Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+            #                begin_index_temp, end_index_temp))
 
             list_sentence_entitys.sort(key=lambda x:x.begin_index)
             list_entitys_temp = list_entitys_temp+list_sentence_entitys

+ 2 - 3
BiddingKG/dl/interface/getAttributes.py

@@ -2062,8 +2062,7 @@ def turnBidWay(bidway):
         return "其他"
 
 def getOtherAttributes(list_entity):
-    dict_other = {"bidway":"",
-                  "moneysource":"",
+    dict_other = {"moneysource":"",
                   "person_review":[],
                   "time_release":"",
                   "time_bidopen":"",
@@ -2111,7 +2110,7 @@ def getPREMs(list_sentences,list_entitys,list_articles):
         result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),
                            **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
                               "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
-                              "attachmentTypes":list_article.attachmentTypes}))
+                              "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway}))
     return result
 
 

Разница между файлами не показана из-за своего большого размера
+ 0 - 0
BiddingKG/dl/test/test4.py


Некоторые файлы не были показаны из-за большого количества измененных файлов