Explorar o código

公司名称补充优化,号码正则优化

znj hai 2 anos
pai
achega
67a9b41c69

+ 28 - 8
BiddingKG/dl/interface/Preprocessing.py

@@ -2546,14 +2546,34 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
                 # 组织机构实体名称补充
                 if entity_type in ["org", "company"]:
-                    fix_name = re.search("(有限)([责贵]?任?)(公?司?)",entity_text)
-                    if fix_name:
-                        if len(fix_name.group(2))>0:
-                            entity_text = entity_text.replace(fix_name.group(), "有限责任公司")
-                        elif len(fix_name.group(3))>0:
-                            entity_text = entity_text.replace(fix_name.group(), "有限公司")
-                        elif re.search("有限$", entity_text):
-                            entity_text = re.sub("有限$","有限公司",entity_text)
+                    if not re.search("有限责任公司|有限公司",entity_text):
+                        fix_name = re.search("(有限)([责贵]?任?)(公?司?)",entity_text)
+                        if fix_name:
+                            if len(fix_name.group(2))>0:
+                                _text = fix_name.group()
+                                if '司' in _text:
+                                    entity_text = entity_text.replace(_text, "有限责任公司")
+                                else:
+                                    _text = re.search(_text + "[^司]{0,5}司", entity_text)
+                                    if _text:
+                                        _text = _text.group()
+                                        entity_text = entity_text.replace(_text, "有限责任公司")
+                                    else:
+                                        entity_text = entity_text.replace(entity_text[fix_name.start():], "有限责任公司")
+                            elif len(fix_name.group(3))>0:
+                                _text = fix_name.group()
+                                if '司' in _text:
+                                    entity_text = entity_text.replace(_text, "有限公司")
+                                else:
+                                    _text = re.search(_text + "[^司]{0,3}司", entity_text)
+                                    if _text:
+                                        _text = _text.group()
+                                        entity_text = entity_text.replace(_text, "有限公司")
+                                    else:
+                                        entity_text = entity_text.replace(entity_text[fix_name.start():], "有限公司")
+                            elif re.search("有限$", entity_text):
+                                entity_text = re.sub("有限$","有限公司",entity_text)
+                    entity_text = entity_text.replace("有公司","有限公司")
                 list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
             # 标记文章末尾的"发布人”、“发布时间”实体
             if sentence_index==len(list_sentence)-1:

+ 27 - 26
BiddingKG/dl/interface/getAttributes.py

@@ -509,8 +509,8 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
                         dict_role_combination[entity.packageName][str(_roleId)] = set([""])
                     dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
     list_real_comba = get_legal_comba(list_entity,dict_role_combination)
-  # print("===role_combination",dict_role_combination)
-  # print("== real_comba",list_real_comba)
+    # print("===role_combination",dict_role_combination)
+    # print("== real_comba",list_real_comba)
     #拿到最大期望值的组合
     max_index = 0
     max_expect = -100
@@ -1439,14 +1439,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     # 例:"采购人联系方式:0833-5226788,"
     phone_pattern = '(1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \
                     '\+86.?1[3-9]\d{9}|' \
-                    '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' \
-                    '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|' \
-                    '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|' \
-                    '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|' \
-                   '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|' \
-                   '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' \
-                   '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|' \
-                   '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|' \
+                    '0[1-9]\d{1,2}[-—-―][2-9]\d{6,7}/[1-9]\d{6,10}|' \
+                    '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?.?转\d{1,4}|' \
+                    '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|' \
+                    '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=1[3-9]\d{9})|' \
+                   '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|' \
+                   '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=[2-9]\d{6,7})|' \
+                   '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?|' \
+                   '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|' \
                    '[2-9]\d{6,7})'
     re_tenderee_phone = re.compile(
         "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
@@ -1515,13 +1515,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
                        '\+86.?1[3-9]\d{9}|'
                        # '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
-                       '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
-                       '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
-                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|'
-                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
-                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
-                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
-                       '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
+                       '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
+                       '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
+                       '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|'
+                       '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|'
+                       '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
+                       '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|'
                        '400\d{7}转\d{1,4}|'
                        '[2-9]\d{6,7}')
     url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[#$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
@@ -1670,7 +1669,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 if list_tokenbegin[j] >= item[2]:
                     end_index = j - 1
                     break
-            _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
+            phone_text = re.sub("[-—-―]+","-",item[0]).replace("(","(").replace(")",")")
+            _entity = Entity(_sentence.doc_id, None, phone_text, "phone", _sentence.sentence_index, begin_index, end_index, item[1],
                              item[2],in_attachment=in_attachment)
             phone_entitys.append(_entity)
     # print('phone_set:',set([ent.entity_text for ent in phone_entitys]))
@@ -2112,14 +2112,15 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     # 公司-联系人连接(km算法)
     re_phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
                        '\+86.?1[3-9]\d{9}|'
-                       '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
-                       '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
-                       '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
-                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|'
-                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
-                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
-                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
-                       '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
+                       # '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
+                       '0[1-9]\d{1,2}[-—-―][2-9]\d{6,7}[^\d]?转\d{1,4}|'
+                       '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
+                       '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
+                       '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|'
+                       '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|'
+                       '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
+                       '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6,7}-?\d{,4}|'
+                       '400\d{7}转\d{1,4}|'
                        '[2-9]\d{6,7}')
     key_phone = re.compile("联系方式|电话|联系人|负责人")
     temporary_list2 = []

+ 1 - 1
BiddingKG/dl/interface/predictor.py

@@ -1509,7 +1509,7 @@ class RoleRuleFinalAdd():
         sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
         sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
         sear_ent2 = re.search('[,:](户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
-        sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点|)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
+        sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
         sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
         sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]