Browse Source

新增招标人—联系人召回规则

znj 3 years ago
parent
commit
8e2997c813

+ 155 - 29
BiddingKG/dl/interface/getAttributes.py

@@ -1148,32 +1148,32 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             agency_contact.add(_person.entity_text)
     # 正则匹配无 '主体/联系人' 的电话
     # 例:"采购人联系方式:0833-5226788,"
-    phone_pattern = '(1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \
-                    '\+86.?1[3|4|5|6|7|8|9]\d{9}|' \
+    phone_pattern = '(1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \
+                    '\+86.?1[3-9]\d{9}|' \
                     '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' \
                     '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|' \
                     '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|' \
-                    '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|' \
+                    '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|' \
                    '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|' \
                    '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' \
                    '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|' \
                    '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|' \
                    '[2-9]\d{6,7})'
     re_tenderee_phone = re.compile(
-        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)"
+        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
         # 电话号码
         + phone_pattern)
     # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
     re_tenderee_phone2 = re.compile(
-        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)"
+        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
         # 电话号码
         + phone_pattern)
     re_agent_phone = re.compile(
-        "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)"
+        "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
         # 电话号码
         + phone_pattern)
     re_agent_phone2 = re.compile(
-        "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)"
+        "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
         # 电话号码
         + phone_pattern)
     content = ""
@@ -1224,18 +1224,22 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
     # 正则提取电话号码实体
     # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
-    phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
-                       '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
+    phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
+                       '\+86.?1[3-9]\d{9}|'
                        # '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
                        '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
                        '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
-                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|'
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
                        '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
                        '[2-9]\d{6,7}')
+    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+    email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
+                            "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
     phone_entitys = []
+    code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
     for _sentence in list_sentence:
         sentence_text = _sentence.sentence_text
         list_tokenbegin = []
@@ -1244,9 +1248,23 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             list_tokenbegin.append(begin)
             begin += len(str(_sentence.tokens[i]))
         list_tokenbegin.append(begin + 1)
-
+        # 排除网址、邮箱、项目编号实体
+        error_list = []
+        for i in re.finditer(url_pattern, sentence_text):
+            error_list.append((i.start(), i.end()))
+        for i in re.finditer(email_pattern, sentence_text):
+            error_list.append((i.start(), i.end()))
+        for code_ent in [ent for ent in code_entitys if ent.sentence_index==_sentence.sentence_index]:
+            error_list.append((code_ent.wordOffset_begin,code_ent.wordOffset_end))
         res_set = set()
         for i in re.finditer(phone, sentence_text):
+            is_continue = False
+            for error_ent in error_list:
+                if i.start()>=error_ent[0] and i.end()<=error_ent[1]:
+                    is_continue = True
+                    break
+            if is_continue:
+                continue
             res_set.add((i.group(), i.start(), i.end()))
         res_set = sorted(list(res_set),key=lambda x:x[1])
         last_phone_mask = True
@@ -1254,24 +1272,43 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
             item = res_set[item_idx]
             phone_left = sentence_text[max(0, item[1] - 10):item[1]]
             phone_right = sentence_text[item[2]:item[2] + 8]
-            # 排除“传真号”和其它错误项
-            if re.search("传,?真|信,?箱|邮,?[箱件]", phone_left):
-                if not re.search("电,?话", phone_left):
+            if re.search("电话|手机|联系人|联系方式”",re.sub(",","",phone_left)):
+                pass
+            else:
+                # 排除“传真号”和其它错误项
+                if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
+                    if not re.search("电,?话", phone_left):
+                        last_phone_mask = False
+                        continue
+                if re.search("注册[证号]|帐号|编[号码]|报价|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
                     last_phone_mask = False
                     continue
-            if re.search("注册[证号]|帐,?号|编,?[号码]|报,?价|标,?价|证,?号|价,?格|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", phone_left):
-                last_phone_mask = False
-                continue
-            if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
-                last_phone_mask = False
-                continue
-            # if:上一个phone实体不符合条件
-            if not last_phone_mask:
-                item_start = item[1]
-                last_item_end = res_set[item_idx-1][2]
-                if item_start - last_item_end<=1 or re.search("^\d+$",sentence_text[last_item_end:item_start]):
+                if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
+                    last_phone_mask = False
+                    continue
+                # 前后跟着字母
+                if re.search("[a-zA-Z/]+$", phone_left) or re.search("^[a-zA-Z/]+", phone_right):
                     last_phone_mask = False
                     continue
+                # 前后跟着长度小于一定值数字的正则排除
+                if re.search("\d+[-—-―]?\d*$",phone_left) or re.search("^\d+[-—-―]?\d*",phone_right):
+                    phone_left_number = re.search("\d+[-—-―]?\d*$",phone_left)
+                    phone_right_number = re.search("^\d+[-—-―]?\d+",phone_right)
+                    if phone_left_number:
+                        if len(phone_left_number.group())<7:
+                            last_phone_mask = False
+                            continue
+                    if phone_right_number:
+                        if len(phone_right_number.group())<7:
+                            last_phone_mask = False
+                            continue
+                # if:上一个phone实体不符合条件
+                if not last_phone_mask:
+                    item_start = item[1]
+                    last_item_end = res_set[item_idx-1][2]
+                    if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―]+$",sentence_text[last_item_end:item_start]):
+                        last_phone_mask = False
+                        continue
             for j in range(len(list_tokenbegin)):
                 if list_tokenbegin[j] == item[1]:
                     begin_index = j
@@ -1295,7 +1332,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         if ent.is_tail==True:
             return False
         entity_left = text[max(0,entity.wordOffset_begin-10):entity.wordOffset_begin]
-        entity_left = re.sub(",()\(\)::","",entity_left)
+        entity_left = re.sub(",()\(\)","",entity_left)
         entity_left = entity_left[-5:]
         if re.search("地址|地点|银行[::]",entity_left):
             return False
@@ -1410,6 +1447,43 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                 PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
                                 break
                 # print(3,combo[0].entity_text,combo[1].entity_text)
+    # 2022/01/25 固定电话可连多个联系人
+    temp_person_entitys = [entity for entity in pre_entity if entity.entity_type == 'person']
+    temp_person_entitys2 = [] #和固定电话相连的联系人
+    for entity in temp_person_entitys:
+        if entity.person_phone:
+            for _phone in entity.person_phone:
+                if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
+                    temp_person_entitys2.append(entity)
+                    break
+    for index in range(len(temp_person_entitys)):
+        entity = temp_person_entitys[index]
+        if entity in temp_person_entitys2:
+            last_person = entity
+            for after_index in range(index + 1, min(len(temp_person_entitys), index + 5)):
+                after_entity = temp_person_entitys[after_index]
+                if after_entity.sentence_index == last_person.sentence_index and after_entity.begin_index - last_person.end_index < 3:
+                    for _phone in entity.person_phone:
+                        if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
+                            if _phone not in after_entity.person_phone:
+                                after_entity.person_phone.append(_phone)
+                    last_person = after_entity
+                else:
+                    break
+            if index==0:
+                continue
+            last_person = entity
+            for before_index in range(index-1, max(-1,index-5), -1):
+                before_entity = temp_person_entitys[before_index]
+                if before_entity.sentence_index == last_person.sentence_index and last_person.begin_index - before_entity.end_index < 3:
+                    for _phone in entity.person_phone:
+                        if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
+                            if _phone not in before_entity.person_phone:
+                                before_entity.person_phone.append(_phone)
+                    last_person = before_entity
+                else:
+                    break
+
     # 更新 PackDict
     not_sure_linked = []
     for link_p in list(linked_company):
@@ -1527,12 +1601,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         last_words_num = len(sentence.sentence_text)
 
     # 公司-联系人连接(km算法)
-    re_phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
-                       '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
+    re_phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
+                       '\+86.?1[3-9]\d{9}|'
                        '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
                        '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
                        '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
-                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|'
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
@@ -1661,6 +1735,13 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                                 match_nums += 1
                         else:
                             next_entity = split_entitys[index + 1]
+                            if next_entity.entity_type in ["org","company"]:
+                                _entity_left = list_sentence[next_entity.sentence_index].sentence_text[max(0, next_entity.wordOffset_begin - 10):next_entity.wordOffset_begin]
+                                _entity_left = re.sub(",()\(\)::", "", _entity_left)
+                                _entity_left = _entity_left[-5:]
+                                if re.search("地址|地点", _entity_left):
+                                    if index + 2<= len(split_entitys) - 1:
+                                        next_entity = split_entitys[index + 2]
                             if entity.sentence_index == next_entity.sentence_index:
                                 mid_tokens += list_sentence[entity.sentence_index].tokens[
                                               entity.end_index + 1:next_entity.begin_index]
@@ -2271,6 +2352,51 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                 for _p in _phone:
                                     PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p))
                                 break
+    # 公告中只有"招标人",无"联系人"链接且上一条规则无效时,如果文中只有一个“phone”实体,则直接取为联系人电话
+    if len(PackDict)==1:
+        k = list(PackDict.keys())[0]
+        if len(PackDict[k]["roleList"])==1:
+            if PackDict[k]["roleList"][0].role_name == "tenderee":
+                if not PackDict[k]["roleList"][0].linklist:
+                    if len(phone_entitys)==1:
+                        PackDict[k]["roleList"][0].linklist.append(("", phone_entitys[0].entity_text))
+    # 公告中只有"招标人",无"联系人"链接且上一条规则无效时,通过大纲直接取电话
+    if len(PackDict)==1:
+        k = list(PackDict.keys())[0]
+        if len(PackDict[k]["roleList"])==1:
+            if PackDict[k]["roleList"][0].role_name == "tenderee":
+                if not PackDict[k]["roleList"][0].linklist:
+                    if len(new_split_list)>1:
+                        for _start,_end in new_split_list:
+                            temp_sentence = _content[_start:_end]
+                            sentence_outline = temp_sentence.split(",")[0]
+                            if re.search("联系人|联系方|联系方式|联系电话|电话|负责人",sentence_outline):
+                                sentence_phone = phone.findall(temp_sentence)
+                                if sentence_phone:
+                                    PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
+                                    break
+    # 公告中只有"招标人",无"联系人"链接且上一条规则无效时,通过正则匹配句子段落来提取电话
+    if len(PackDict)==1:
+        k = list(PackDict.keys())[0]
+        if len(PackDict[k]["roleList"])==1:
+            if PackDict[k]["roleList"][0].role_name == "tenderee":
+                if not PackDict[k]["roleList"][0].linklist:
+                    contacts_person = "(?:联系人|联系方|联系方式|负责人|电话|联系电话)[::]?"
+                    tenderee_pattern = "(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主|业主单位)[^。]{0,5}"
+                    contact_pattern_list = [tenderee_pattern + contacts_person,"(?:采购[^。,]{0,2}项目|采购事项|招标)[^。,]{0,4}" + contacts_person,
+                                            "(?:项目|采购)[^。,]{0,4}"+contacts_person,"(?:报名|报价|业务咨询|业务|投标咨询)[^。,]{0,4}"+contacts_person,]
+                    for _pattern in contact_pattern_list:
+                        get_tenderee_contacts = False
+                        for regular_match in re.finditer(_pattern,_content):
+                            match_text = _content[regular_match.end():regular_match.end()+40]
+                            match_text = match_text.split("。")[0]
+                            sentence_phone = phone.findall(match_text)
+                            if sentence_phone:
+                                PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
+                                get_tenderee_contacts = True
+                                break
+                        if get_tenderee_contacts:
+                            break
 
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):

+ 3 - 0
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -90,6 +90,7 @@ def predict(doc_id,text):
     #             print(entity.entity_text, entity.pointer_person,entity.label,entity.values)
     #             pass
     roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys)
     # print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)
 
@@ -164,6 +165,8 @@ def predict(doc_id,text):
             #         print('pointer_pack_name:',entity.pointer_pack.entity_text)
             # elif entity.entity_type =='money':
             #     print('money',entity.entity_text,entity.label)
+            # elif entity.entity_type =='phone':
+            #     print('phone',entity.entity_text)
             # elif entity.entity_type =='name':
             #     print('pj_name',entity.entity_text,entity.sentence_index,entity.begin_index)
             # elif entity.entity_type in ['package']: