Эх сурвалжийг харах

招标联系人提取规则优化

znj 6 өдөр өмнө
parent
commit
4cac6d89c9

+ 3 - 3
BiddingKG/dl/channel/channel_bert.py

@@ -519,7 +519,7 @@ class_dict = {51: '公告变更',
 tenderee_type = ['公告变更','招标公告','招标预告','招标答疑','资审结果','采购意向']
 win_type = ['中标信息','废标公告','候选人公示','合同公告','开标记录','验收合同']
 
-def merge_channel(list_articles,channel_dic,original_docchannel):
+def merge_channel(list_articles,channel_dic,original_docchannel,web_source_no=""):
 
     def merge_rule(title,text,docchannel,pred_channel,channel_dic,original_docchannel):
         front_text_len = len(text)//3 if len(text)>300 else 100
@@ -601,8 +601,8 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
             channel_dic['docchannel']['docchannel'] = pred
             channel_dic['docchannel']['use_original_docchannel'] = 0
 
-    # '招标预告'类 规则纠正
-    if channel_dic['docchannel']['doctype']=='采招数据' and channel_dic['docchannel']['docchannel']=="招标公告":
+    # '招标预告'类 规则纠正,规则排除部分站源
+    if channel_dic['docchannel']['doctype']=='采招数据' and channel_dic['docchannel']['docchannel']=="招标公告" and web_source_no not in ['DX000027-1']:
         if "##attachment##" in text:
             main_text, attachment_text = text.split("##attachment##", maxsplit=1)
         else:

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -465,7 +465,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     else:
         channel_dic, msc = predictor.getPredictor("channel").final_change(channel_dic, prem[0], original_docchannel, msc)
     # print('msc', msc)
-    channel_dic = merge_channel(list_articles,channel_dic,original_docchannel) # channel_dic 根据新模型预测结合判断,整合结果
+    channel_dic = merge_channel(list_articles,channel_dic,original_docchannel,web_source_no) # channel_dic 根据新模型预测结合判断,整合结果
     cost_time["rule_channel2"] = round(time.time()-start_time,2)
 
     '''一包多中标人提取及所有金额提取'''

+ 29 - 22
BiddingKG/dl/interface/getAttributes.py

@@ -1292,11 +1292,18 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     agency_phone = set()
     winter_contact = set()
     rule_winter_phone = set()
+    tenderee_entity_set = set()
+    agency_entity_set = set()
     for _person in person_list:
         if _person.label == 1:
             tenderee_contact.add(_person.entity_text)
         if _person.label == 2:
             agency_contact.add(_person.entity_text)
+    for _entity in [entity for entity in list_entity if entity.entity_type in ['company','org']]:
+        if _entity.label==0:
+            tenderee_entity_set.add(_entity.entity_text)
+        elif _entity.label==1:
+            agency_entity_set.add(_entity.entity_text)
     # 正则匹配无 '主体/联系人' 的电话
     # 例:"采购人联系方式:0833-5226788,"
     phone_pattern = '(1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \
@@ -1653,7 +1660,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             # print('not_win_tenderer_contact1')
                             continue
                         # 角色为招标/代理人,排除"纪检|监察"相关的联系人
-                        if _subject.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
+                        if (_subject.label in [0,1] or _subject.entity_text in tenderee_entity_set|agency_entity_set) and re.search("纪检|监察|投诉|监督|乙方|中标",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
                         # if _subject.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[_object.sentence_index].sentence_text[_subject.end_index:_object.wordOffset_begin]):
                             continue
                         if _object.sentence_index!=0 and _object.wordOffset_begin<=10:
@@ -2041,7 +2048,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 "联系人|联系方式|电话|负责人|经理|法人|法定代表人", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
                             continue
                         # 角色为招标/代理人,排除"纪检|监察"相关的联系人
-                        if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                        if (entity.label in [0,1] or entity.entity_text in tenderee_entity_set|agency_entity_set) and re.search("纪检|监察|投诉|监督|乙方|中标", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
                             break
                         if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
                             if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",list_sentence[after_entity.sentence_index - 1].sentence_text[-10:] +
@@ -2070,7 +2077,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 max(0, previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
                             continue
                         # 角色为招标/代理人,排除"纪检|监察"相关的联系人
-                        if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[previous_entity.sentence_index].sentence_text[
+                        if (entity.label in [0,1] or entity.entity_text in tenderee_entity_set|agency_entity_set) and re.search("纪检|监察|投诉|监督|乙方|中标", list_sentence[previous_entity.sentence_index].sentence_text[
                                                                                max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
                             break
                         if previous_entity.sentence_index == entity.sentence_index:
@@ -2219,7 +2226,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 if entity.label in [2, 3, 4] and not after_entity.person_phone and not re.search("联系人|联系方式|电话|负责人|经理|法人|法定代表人",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
                                     continue
                                 # 角色为招标/代理人,排除"纪检|监察"相关的联系人
-                                if entity.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                                if (entity.label in [0,1] or entity.entity_text in tenderee_entity_set|agency_entity_set) and re.search("纪检|监察|投诉|监督|乙方|中标",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
                                     break
                                 if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
                                     if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",
@@ -2395,7 +2402,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                             list_sentence[previous_entity.sentence_index].sentence_text[max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
                                         continue
                                     # 角色为招标/代理人,排除"纪检|监察"相关的联系人
-                                    if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[previous_entity.sentence_index].sentence_text[
+                                    if (entity.label in [0,1] or entity.entity_text in tenderee_entity_set|agency_entity_set) and re.search("纪检|监察|投诉|监督|乙方|中标", list_sentence[previous_entity.sentence_index].sentence_text[
                                                                                            max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
                                         break
                                     if previous_entity.sentence_index == entity.sentence_index:
@@ -3163,23 +3170,23 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                             tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
                                             get_contacts = True
                                             break
-                    if not get_contacts:
-                        # 直接取文中倒数第一个联系人
-                        for _entity in temporary_list2[::-1]:
-                            if _entity.entity_type=='person' and _entity.label==3:
-                                if _entity.person_phone:
-                                    _phone = [p.entity_text for p in _entity.person_phone]
-                                    for _p in _phone:
-                                        if _entity.entity_text not in exist_person and _p not in ",".join(exist_phone):
-                                            tenderee_agency_role[0].linklist.append((_entity.entity_text, _p))
-                                            get_contacts = True
-                                    break
-                    if not get_contacts:
-                        # 如果文中只有一个“phone”实体,则直接取为联系人电话
-                        if len(phone_entitys) == 1:
-                            if phone_entitys[0].entity_text not in ",".join(exist_phone):
-                                tenderee_agency_role[0].linklist.append(("", phone_entitys[0].entity_text))
-                                get_contacts = True
+                    # if not get_contacts: # 会召回错误数据,不启用规则
+                    #     # 直接取文中倒数第一个联系人
+                    #     for _entity in temporary_list2[::-1]:
+                    #         if _entity.entity_type=='person' and _entity.label==3:
+                    #             if _entity.person_phone:
+                    #                 _phone = [p.entity_text for p in _entity.person_phone]
+                    #                 for _p in _phone:
+                    #                     if _entity.entity_text not in exist_person and _p not in ",".join(exist_phone):
+                    #                         tenderee_agency_role[0].linklist.append((_entity.entity_text, _p))
+                    #                         get_contacts = True
+                    #                 break
+                    # if not get_contacts: # 会召回错误数据,不启用规则
+                    #     # 如果文中只有一个“phone”实体,则直接取为联系人电话
+                    #     if len(phone_entitys) == 1:
+                    #         if phone_entitys[0].entity_text not in ",".join(exist_phone):
+                    #             tenderee_agency_role[0].linklist.append(("", phone_entitys[0].entity_text))
+                    #             get_contacts = True
                     if not get_contacts:
                         # 通过大纲Outline类直接取电话
                         if len(new_split_list) > 1: