Browse Source

中标联系人提取修复

znj 1 month ago
parent
commit
5792057ebd

+ 4 - 3
BiddingKG/dl/interface/extract.py

@@ -318,10 +318,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     start_time = time.time()
     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
-    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
+    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope = extract_parameters(parse_document)
+
     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
-        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
+        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope = extract_parameters(parse_document)
     # print('out_lines',out_lines)
     # if addr_bidopen_text == '':
     #     addr_bidopen_text = extract_addr(list_articles[0].content)
@@ -430,7 +431,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     entityLink.link_entitys(list_entitys)
     doctitle_refine = entityLink.doctitle_refine(title)
     nlp_enterprise,nlp_enterprise_attachment, dict_enterprise = entityLink.get_nlp_enterprise(list_entitys[0])
-    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time)
+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time,winter_scope)
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 

+ 219 - 33
BiddingKG/dl/interface/getAttributes.py

@@ -888,7 +888,10 @@ def dispatch(match_list):
     main_roles = list(set([match.main_role for match in match_list]))
     # print('main_roles',[i.entity_text for i in main_roles])
     attributes = list(set([match.attribute for match in match_list]))
-    # print('attributes',[i.entity_text for i in attributes])
+    # try:
+    #     print('attributes',[i.entity_text for i in attributes])
+    # except:
+    #     pass
 
     label = np.zeros(shape=(len(main_roles), len(attributes)))
     for match in match_list:
@@ -907,7 +910,7 @@ def dispatch(match_list):
 from BiddingKG.dl.common.Utils import getUnifyMoney
 from BiddingKG.dl.interface.modelFactory import Model_relation_extraction
 relationExtraction_model = Model_relation_extraction()
-def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_sentence,list_entity,list_outline,on_value = 0.5,on_value_person=0.5,sentence_len=4):
+def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_sentence,list_entity,list_outline,winter_scope,on_value = 0.5,on_value_person=0.5,sentence_len=4):
     '''
     @param:
         PackDict:文章包dict
@@ -1286,6 +1289,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     agency_contact = set()
     agency_phone = set()
     winter_contact = set()
+    rule_winter_phone = set()
     for _person in person_list:
         if _person.label == 1:
             tenderee_contact.add(_person.entity_text)
@@ -1305,22 +1309,39 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                    '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|' \
                    '[2-9]\d{6,7})'
     re_tenderee_phone = re.compile(
-        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。代理]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
+        # "(?:(?:(?:采购|招标|议价|议标|比选|业主|委托)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)(?:单位)?[^。代理]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
+        "(?:(?:(?:遴选|寻源|采购|招标|竞价|议价|比选|(?:[^受被]|^)委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求?|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)"
+        "(?:人|方|商|单位|组织|用户|业主|主体|部门|公司|企业))(?:单位)?[^。代理]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
         # 电话号码
         + phone_pattern)
     # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
     re_tenderee_phone2 = re.compile(
-        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。代理]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
+        # "(?:(?:(?:采购|招标|议价|议标|比选|业主)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)(?:单位)?[^。代理]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
+        "(?:(?:(?:遴选|寻源|采购|招标|竞价|议价|比选|(?:[^受被]|^)委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求?|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)"
+        "(?:人|方|商|单位|组织|用户|业主|主体|部门|公司|企业))(?:单位)?[^。代理]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话|联系人和联系方式)[::]?[^。]{0,20}?)"
         # 电话号码
         + phone_pattern)
     re_agent_phone = re.compile(
-        "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
+        "(?:(?:(?:代理|[受被]委托)(?:人|方|商|机构|公司|单位|组织|企业)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话|联系人和联系方式)[::]?[^。]{0,7}?)"
         # 电话号码
         + phone_pattern)
     re_agent_phone2 = re.compile(
-        "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
+        "(?:(?:(?:代理|[受被]委托)(?:人|方|商|机构|公司|单位|组织|企业)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话|联系人和联系方式)[::]?[^。]{0,20}?)"
         # 电话号码
         + phone_pattern)
+    re_win_tenderer_phone = re.compile(
+        "(?:(?:(?:乙|竞得|受让|买受|签约|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选)"
+        "(?:候选|投标)?(?:人|单位|(?:中介)?(?:服务)?机构|供应商|客户|方|公司|企业|厂商|商|社会资本方?)|选定单位|中[标选]银行|成交对象)[^。审核]{0,5}(?:负责人|联系人|项目)?(?:经理|电话|联系方式|联系人|负责人|联系电话|联系人和联系方式)[::]?[^。]{0,7}?)"
+        + phone_pattern)
+    re_win_tenderer_phone2 = re.compile(
+        "(?:(?:(?:乙|竞得|受让|买受|签约|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选)"
+        "(?:候选|投标)?(?:人|单位|(?:中介)?(?:服务)?机构|供应商|客户|方|公司|企业|厂商|商|社会资本方?)|选定单位|中[标选]银行|成交对象)[^。]{0,3}(?:地址)[^。审核]{0,3}(?:负责人|联系人|项目)?(?:经理|电话|联系方式|联系人|负责人|联系电话|联系人和联系方式)[::]?[^。]{0,20}?)"
+        + phone_pattern)
+    not_win_tenderer_contact = re.compile("纪检|监察|质疑|投诉|监督|受理|请.{0,4}(联系|与)"
+                                          "|(遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求?|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选|发布|代理|拍卖|转出){1,2}"
+                                          "(人|方|商|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行|机构){0,2}"
+                                          "[\u4e00-\u9fa5]{0,4}(联系|咨询|电话)(人|电话|方式)?")
+
     content = ""
     for _sentence in list_sentence:
         content += "".join(_sentence.tokens)
@@ -1366,6 +1387,20 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     for one_phone in _phone:
                         PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
                         agency_phone.add(one_phone)
+    # 中标人联系方式规则筛选
+    _winter_phone = re.findall(re_win_tenderer_phone, content)
+    if _winter_phone:
+        for _phone in _winter_phone:
+            _phone = _phone.split("/")
+            for one_phone in _phone:
+                rule_winter_phone.add(one_phone)
+    _winter_phone2 = re.findall(re_win_tenderer_phone2, content)
+    if _winter_phone2:
+        for _phone in _winter_phone2:
+            _phone = _phone.split("/")
+            for one_phone in _phone:
+                rule_winter_phone.add(one_phone)
+
     # 正则提取电话号码实体
     # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
     phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
@@ -1443,7 +1478,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         error_numStr_index.append(numStr_index)
                         last_phone_mask = False
                         continue
-                if re.search("身份证号?码?|注册[证号]|帐号|编[号码]|报价|费率|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
+                if re.search("身份证号?码?|注册[证号]|帐号|编[号码]|报价|费率|标价|证号|资格证|资质|价格|金额|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
                     error_numStr_index.append(numStr_index)
                     last_phone_mask = False
                     continue
@@ -1611,7 +1646,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
                             continue
                         # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
-                        if _subject.label in [2,3,4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
+                        # if _subject.label in [2,3,4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
+                        if _subject.label in [2,3,4] and re.search(not_win_tenderer_contact,list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-15):_object.wordOffset_begin]):
+                            # print('not_win_tenderer_contact1')
                             continue
                         # 角色为招标/代理人,排除"纪检|监察"相关的联系人
                         if _subject.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
@@ -1678,7 +1715,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             if temp.begin_index>combo[0].begin_index:
                                 is_continue = True
                                 break
-                if is_continue: continue
+                if is_continue:
+                    continue
                 combo[0].person_phone.append(combo[1])
                 linked_connetPerson.add(combo[0])
                 linked_phone.add(combo[1])
@@ -1913,9 +1951,11 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             person_phone = [phone for phone in per.person_phone] if per.person_phone else []
                             if not person_phone:
                                 if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact:
-                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
-                                    winter_contact.add(per.entity_text)
-                                    continue
+                                    # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
+                                    if re.search("联系人|联系方式|电话|负责人|经理|法人|法定代表人",list_sentence[per.sentence_index].sentence_text[max(0, per.wordOffset_begin - 10):per.wordOffset_begin]):
+                                        PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
+                                        winter_contact.add(per.entity_text)
+                                        continue
                             for _p in person_phone:
                                 if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \
                                         per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
@@ -1947,6 +1987,122 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             for _p in person_phone:
                                 if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
                                     PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
+
+    # 使用中标信息大纲提取联系人
+    winter_scope_group = []
+    if winter_scope:
+        winter_scope_begin = winter_scope[0]
+        winter_scope_end = winter_scope[1]
+        # print(list_sentence[winter_scope_begin[0]].sentence_text[winter_scope_begin[1]:winter_scope_end[1]])
+        winter_temporary_list = []
+        for entity in list_entity:
+            if entity.entity_type in ['org', 'company', 'person']:
+                winter_temporary_list.append(entity)
+        winter_temporary_list = sorted(winter_temporary_list, key=lambda x: (x.sentence_index, x.begin_index))
+        winter_temporary_list2 = []
+        for _entity in winter_temporary_list:
+            if _entity.sentence_index>=winter_scope_begin[0] and _entity.sentence_index<=winter_scope_end[0]:
+                if (_entity.sentence_index==winter_scope_begin[0] and _entity.wordOffset_begin>=winter_scope_begin[1]) or \
+                        _entity.sentence_index>winter_scope_begin[0]:
+                    if (_entity.sentence_index == winter_scope_end[0] and _entity.wordOffset_end<=winter_scope_end[1]) or \
+                            _entity.sentence_index<winter_scope_end[0]:
+                        winter_temporary_list2.append(_entity)
+        # print('winter_scope_entity',[i.entity_text for i in winter_temporary_list2])
+        winter_scope_group = winter_temporary_list2
+
+        match_list_winter = []
+        for index in range(len(winter_scope_group)):
+            entity = winter_scope_group[index]
+            if entity.entity_type in ['company','org']:
+                match_nums = 0
+                for after_index in range(index + 1, min(len(winter_scope_group), index + 4)):
+                    after_entity = winter_scope_group[after_index]
+                    if match_nums > 2:
+                        break
+                    if after_entity.entity_type == 'person':
+                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                                tokens_num_dict[entity.sentence_index] + entity.end_index)
+                        # 实体为中标人/候选人,联系人已确定类别【1,2】
+                        if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
+                            break
+                        if entity.label in [2, 3, 4] and distance >= 30:
+                            break
+                        # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
+                        if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 15):after_entity.wordOffset_begin]):
+                            break
+                        # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
+                        if entity.label in [2, 3, 4] and not after_entity.person_phone and not re.search(
+                                "联系人|联系方式|电话|负责人|经理|法人|法定代表人", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                            continue
+                        # 角色为招标/代理人,排除"纪检|监察"相关的联系人
+                        if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                            break
+                        if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
+                            if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",list_sentence[after_entity.sentence_index - 1].sentence_text[-10:] +
+                                                                       list_sentence[after_entity.sentence_index].sentence_text[0:after_entity.wordOffset_begin]):
+                                continue
+                        if distance < 80:
+                            if (entity.label == 0 and after_entity.label == 1) or (
+                                    entity.label == 1 and after_entity.label == 2):
+                                distance = distance / 100
+                            value = (-1 / 2 * (distance ** 2)) / 10000
+                            match_list_winter.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                # 前向查找匹配
+                if index != 0:
+                    previous_entity = winter_scope_group[index - 1]
+                    if previous_entity.entity_type == 'person' and previous_entity.label in [1,2,3]:
+                        if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
+                            continue
+                        # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
+                        if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, list_sentence[previous_entity.sentence_index].sentence_text[
+                                                                                             max(0,previous_entity.wordOffset_begin - 15):previous_entity.wordOffset_begin]):
+                            break
+                        # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
+                        if entity.label in [2, 3, 4] and not previous_entity.person_phone and not re.search(
+                                "联系人|联系方式|电话|负责人|经理|法人|法定代表人",list_sentence[previous_entity.sentence_index].sentence_text[
+                                max(0, previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
+                            continue
+                        # 角色为招标/代理人,排除"纪检|监察"相关的联系人
+                        if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[previous_entity.sentence_index].sentence_text[
+                                                                               max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
+                            break
+                        if previous_entity.sentence_index == entity.sentence_index:
+                            distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
+                                    tokens_num_dict[
+                                        previous_entity.sentence_index] + previous_entity.end_index)
+                            if distance < 30:
+                                # 距离相等时,前向添加处罚值
+                                # distance += 1
+                                # 前向 没有 /10000
+                                value = (-1 / 2 * (distance ** 2))
+                                match_list_winter.append(Match(entity, previous_entity, value))
+        # test
+        # match_list_winter = company_contact_link([winter_scope_group])
+        # km算法分配求解
+        result_winter = dispatch(match_list_winter)
+        for match in result_winter:
+            _company = match[0]
+            _person = match[1]
+            _person = _person.entity_text
+            # 更新中标人联系方式
+            if _company.label==2:
+                phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
+                for k in PackDict.keys():
+                    for i in range(len(PackDict[k]["roleList"])):
+                        if PackDict[k]["roleList"][i].role_name == "win_tenderer":
+                            if PackDict[k]["roleList"][i].entity_text == _company.entity_text:
+                                if _person not in tenderee_contact and len(set(phone_) & set(tenderee_phone)) == 0 and \
+                                        _person not in agency_contact and len(set(phone_) & set(agency_phone)) == 0:
+                                    if not phone_:
+                                        PackDict[k]["roleList"][i].linklist.append((_person, ""))
+                                    for p in phone_:
+                                        PackDict[k]["roleList"][i].linklist.append((_person, p))
+                if phone_:
+                    for p in phone_:
+                        rule_winter_phone.add(p)
+                    # print('rule_winter_phone',rule_winter_phone)
+
     re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
     split_list = [0] * 16
     split_dict = {
@@ -2045,11 +2201,17 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 # 实体为中标人/候选人,联系人已确定类别【1,2】
                                 if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
                                     break
-                                if entity.label in [2, 3, 4] and distance>=20:
+                                if entity.label in [2, 3, 4] and distance>=30:
                                     break
                                 # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
-                                if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|(采购|招标)人?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                                # if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位|业主)?联系|(采购|招标)人?联系|请.{0,4}联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                                if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 15):after_entity.wordOffset_begin]):
+                                    # print('not_win_tenderer_contact2')
                                     break
+                                # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
+                                # print('test',after_entity.entity_text,after_entity.person_phone,list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin])
+                                if entity.label in [2, 3, 4] and not after_entity.person_phone and not re.search("联系人|联系方式|电话|负责人|经理|法人|法定代表人",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                                    continue
                                 # 角色为招标/代理人,排除"纪检|监察"相关的联系人
                                 if entity.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
                                     break
@@ -2136,7 +2298,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                             new_split_list[split_index][1]:
                                         mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
                                         if re.search(key_phone, mid_sentence):
-                                            if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系",mid_sentence[-8:]):
+                                            # if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位|业主)?联系|(采购|招标)人?联系|请.{0,4}联系",mid_sentence[-10:]):
+                                            if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact,mid_sentence[-15:]):
+                                                # print('not_win_tenderer_contact3')
                                                 pass
                                             else:
                                                 distance = 1
@@ -2189,7 +2353,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                         p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
                                         if next_entity.entity_type == 'person' and _phone in p_phone:
                                             pass
-                                        elif entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", mid_sentence[-8:]):
+                                        # elif entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位|业主)?联系|(采购|招标)人?联系|请.{0,4}联系", mid_sentence[-10:]):
+                                        elif entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, mid_sentence[-15:]):
+                                            # print('not_win_tenderer_contact4')
                                             pass
                                         else:
                                             distance = (tokens_num_dict[
@@ -2213,6 +2379,19 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
                                     if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
                                         continue
+                                    # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
+                                    if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact,list_sentence[previous_entity.sentence_index].sentence_text[
+                                                                               max(0,previous_entity.wordOffset_begin - 15):previous_entity.wordOffset_begin]):
+                                        # print('not_win_tenderer_contact2')
+                                        break
+                                    # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
+                                    if entity.label in [2, 3,4] and not previous_entity.person_phone and not re.search("联系人|联系方式|电话|负责人|经理|法人|法定代表人",
+                                            list_sentence[previous_entity.sentence_index].sentence_text[max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
+                                        continue
+                                    # 角色为招标/代理人,排除"纪检|监察"相关的联系人
+                                    if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[previous_entity.sentence_index].sentence_text[
+                                                                                           max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
+                                        break
                                     if previous_entity.sentence_index == entity.sentence_index:
                                         distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
                                                 tokens_num_dict[
@@ -2384,10 +2563,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
     # "roleList"中联系人电话去重
     tenderee_agency_phone = []
+    tenderee_agency_contact = []
     for k in PackDict.keys():
         for i in range(len(PackDict[k]["roleList"])):
             if PackDict[k]["roleList"][i].role_name in ['agency','tenderee']:
                 tenderee_agency_phone.extend([person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[1]])
+                tenderee_agency_contact.extend([person_phone[0]+'-'+person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist])
             # 带有联系人的电话
             with_person = [person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[0]]
             # 带有电话的联系人
@@ -2407,22 +2588,25 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     for k in PackDict.keys():
         for i in range(len(PackDict[k]["roleList"])):
             if PackDict[k]["roleList"][i].role_name in ['win_tenderer', 'second_tenderer','third_tenderer']:
-                if tenderee_agency_phone:
+                if tenderee_agency_phone or tenderee_agency_contact:
                     remove_list = []
                     for item in PackDict[k]["roleList"][i].linklist:
                         if item[1] and item[1] in tenderee_agency_phone:
                             remove_list.append(item)
+                        elif item[0]+'-'+item[1] in tenderee_agency_contact:
+                            remove_list.append(item)
+                    for _item in remove_list:
+                        PackDict[k]["roleList"][i].linklist.remove(_item)
+                elif not tenderee_agency_phone:
+                    # 公告中无招标代理联系方式时,可排除中标联系方式
+                    remove_list = []
+                    for _item in PackDict[k]["roleList"][i].linklist:
+                        # 排除非正则规则识别的联系方式
+                        if _item[1] not in rule_winter_phone:
+                            remove_list.append(_item)
+                    # print('remove_list',remove_list)
                     for _item in remove_list:
                         PackDict[k]["roleList"][i].linklist.remove(_item)
-                # else:
-                #     # 公告中无招标代理联系方式时,可排除中标联系方式
-                #     remove_list = []
-                #     for _item in PackDict[k]["roleList"][i].linklist:
-                #         # 有联系方式
-                #         if _item[1]:
-                #             remove_list.append(_item)
-                #     for _item in remove_list:
-                #         PackDict[k]["roleList"][i].linklist.remove(_item)
     # PackDict更新company/org地址
     last_role_prob = {}
     for ent in pre_entity:
@@ -2934,7 +3118,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     # 公告中只有"招标人"且无"联系人"链接时
     if len(PackDict)==1:
         k = list(PackDict.keys())[0]
-        tenderee_agency_role = [role for role in PackDict[k]["roleList"] if role.role_name in ['tenderee','agency']]
+        tenderee_agency_role = [role for role in PackDict[k]["roleList"] if role.role_name in ['tenderee','agency','win_tenderer']]
         if len(tenderee_agency_role)==1:
             exist_person = []
             exist_phone = []
@@ -2951,7 +3135,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     if not get_contacts:
                         # 根据大纲Outline类召回联系人
                         for outline in list_outline:
-                            if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary):
+                            if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary) and \
+                                    not re.search("代理|乙方|竞得|受让|买受|签约|供货|供应|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选",outline.outline_summary):
                                 for t_person in [p for p in temporary_list2 if p.entity_type=='person' and p.label==3]:
                                     if words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= words_num_dict[outline.sentence_begin_index] + outline.wordOffset_begin and words_num_dict[
                                         t_person.sentence_index] + t_person.wordOffset_end < words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
@@ -2995,7 +3180,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             for _start, _end in new_split_list:
                                 temp_sentence = _content[_start:_end]
                                 sentence_outline = temp_sentence.split(",::")[0]
-                                if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline):
+                                if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline) and \
+                                        not re.search("代理|乙方|竞得|受让|买受|签约|供货|供应|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选",sentence_outline):
                                     sentence_phone = phone.findall(temp_sentence)
                                     if sentence_phone:
                                         if sentence_phone[0] in [ent.entity_text for ent in phone_entitys] and sentence_phone[0] not in ",".join(exist_phone):
@@ -3069,7 +3255,7 @@ def initPackageAttr(RoleList,PackageSet,win_tenderer_set,tenderee_or_agency_set,
         packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,item.role_prob,0,0.0,[],set(item.multi_winner)-win_tenderer_set-tenderee_or_agency_set)) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,多中标人)
     return packDict
                 
-def getPackageRoleMoney(list_sentence,list_entity,list_outline):
+def getPackageRoleMoney(list_sentence,list_entity,list_outline,winter_scope):
     '''
     @param:
         list_sentence:文章的句子list
@@ -3089,7 +3275,7 @@ def getPackageRoleMoney(list_sentence,list_entity,list_outline):
     # PackDict = initPackageAttr(RoleList, PackageSet)
     PackDict = initPackageAttr(RoleList, PackageSet, win_tenderer_set,tenderee_or_agency_set,main_body_pack)
 
-    PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline)
+    PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline, winter_scope)
     return PackDict
 
 def turnBidWay(bidway):
@@ -4272,7 +4458,7 @@ def getProjectContacts(list_entity, list_sentence):
 
     return {'project_contacts':project_contacts_list}
 
-def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time):
+def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time,winter_scope):
     '''
     @param:
         list_sentence:所有文章的句子list
@@ -4281,7 +4467,7 @@ def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time):
     '''
     result = []
     for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
-        RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline)
+        RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline,winter_scope)
         result.append(dict({"prem": RoleList, "docid": list_article.doc_id},
                            **getTimeAttributes(list_entity, list_sentence,page_time),
                            **getProjectContacts(list_entity, list_sentence),

+ 26 - 1
BiddingKG/dl/interface/outline_extractor.py

@@ -57,6 +57,8 @@ def extract_sentence_list(sentence_list):
 requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
                           "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
                       "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)"
+winter_pattern = "((乙方|竞得|受让|买受|签约|供货|供应|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选)[\u4e00-\u9fa5]{0,5}" \
+                        "(公示)?(信息|概况|情况|名称|联系人|联系方式|负责人)|中标公示单位)为?([::,、]|$)"  # Heading of a winning-bid information section: one of the listed winner-side keywords, up to 5 CJK characters, an optional "公示", then a field word (info/name/contact/...) -- or the literal "中标公示单位" -- followed by an optional "为" and a separator or end of text. extract_parameters searches it against the first 30 characters of a titled sentence.
 aptitude_pattern = "资质(资格)要求|资格(资质)要求|单位要求|资质及业绩要求|((资格|资质|准入)[的及]?(要求|条件|标准|限定|门槛)|竞买资格及要求|供应商报价须知)|按以下要求参与竞买|((报名|应征|竞买|投标|竞投|受让|报价|竞价|竞包|竞租|承租|申请|参与|参选|遴选)的?(人|方|单位|企业|客户|机构)?|供应商|受让方)((必?须|需|应[该当]?)(具备|满足|符合|提供)+以?下?)?的?(一般|基本|主要)?(条件|要求|资格(能力)?|资质)+|乙方应当符合下列要求|参与比选条件|合格的投标人|询价要求"
 
 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
@@ -77,6 +79,7 @@ def extract_parameters(parse_document):
     addr_bidopen_text = '' # 开标地址
     addr_bidsend_text = '' # 投标地址
     requirement_scope = [] # 采购内容始末位置
+    winter_scope = [] # 中标信息始末位置
     pinmu_name = '' # 品目名称
     list_policy = [] # 政策法规
     out_lines = []
@@ -107,6 +110,28 @@ def extract_parameters(parse_document):
                     requirement_scope.append(e)
                     _data_i += len(childs)
                     _data_i -= 1
+
+    _data_i = -1
+    # 中标信息
+    while _data_i<len(list_data)-1:
+        _data_i += 1
+        _data = list_data[_data_i]
+        _type = _data["type"]
+        _text = _data["text"].strip()
+        # print(_data.keys())
+        if _type=="sentence":
+            # print('_text',_text)
+            # print('sentence_title',_data["sentence_title"])
+            if _data["sentence_title"] is not None:
+                if re.search(winter_pattern,_text[:30]) is not None:
+                    b = (_data['sentence_index'], _data['wordOffset_begin'])
+                    childs = get_childs([_data])
+                    e = (childs[-1]['sentence_index'], childs[-1]["wordOffset_end"]) if len(childs)>0 else (_data['sentence_index'], _data['wordOffset_end'])
+                    winter_scope.append(b)
+                    winter_scope.append(e)
+                    _data_i += len(childs)
+                    _data_i -= 1
+
     _data_i = -1
     while _data_i<len(list_data)-1:
         _data_i += 1
@@ -196,7 +221,7 @@ def extract_parameters(parse_document):
         pinmu_name = pinmu_name[ser.end():]
         if re.search('[^\w]$', pinmu_name):
             pinmu_name = pinmu_name[:-1]
-    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy
+    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope
 
 def extract_addr(content):
     '''