Parcourir la source

联系人正则补充优化

znj il y a 3 ans
Parent
commit
f87dcbef5b

+ 10 - 4
BiddingKG/dl/interface/Preprocessing.py

@@ -1078,9 +1078,11 @@ def segment(soup,final=True):
     text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
     #替换为中文分号
     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
+    # 感叹号替换为中文句号
+    text = re.sub("(?<=[\u4e00-\u9fa5])[!!]|[!!](?=[\u4e00-\u9fa5])","。",text)
     #替换"?"为 " " ,update:2021/7/20
     text = re.sub("?"," ",text)
-         
+
 
     #替换"""为"“",否则导入deepdive出错
     text = text.replace('"',"“").replace("\r","").replace("\n",",")
@@ -1925,10 +1927,11 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             # "联系人"正则补充提取  2021/11/15 新增
             list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
-            error_text = ['传真','网址','电子邮','联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
+            error_text = ['交易','机构','教育','项目','公司','中标','开标','截标','监督','政府','国家','中国','技术','投标','传真','网址','电子邮',
+                          '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
             list_person_text = set(list_person_text + error_text)
-            re_person = re.compile("联系人[::]([\u4e00-\u9fa5]{2,3})(?=联系)|"
-                                   "联系人[::]([\u4e00-\u9fa5])|"
+            re_person = re.compile("联系人[::]([\u4e00-\u9fa5])|"
+                                   "联系人[::]([\u4e00-\u9fa5]{2,3})(?=联系)|"
                                    "联系人[::]([\u4e00-\u9fa5]{2,3})")
             list_person = []
             for match_result in re_person.finditer(sentence_text):
@@ -1937,6 +1940,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 wordOffset_begin = match_result.start() + 4
                 wordOffset_end = match_result.end()
                 # print(text[wordOffset_begin:wordOffset_end])
+                # 排除一些不为人名的实体
+                if re.search("^[\u4e00-\u9fa5]{7,}([,。]|$)",sentence_text[wordOffset_begin:wordOffset_begin+20]):
+                    continue
                 if entity_text not in list_person_text and entity_text[:2] not in list_person_text:
                     _person = dict()
                     _person['body'] = entity_text

+ 2 - 2
BiddingKG/dl/interface/getAttributes.py

@@ -1129,10 +1129,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 if not re.search("电,?话", phone_left):
                     last_phone_mask = False
                     continue
-            if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]", phone_left):
+            if re.search("注册[证号]|帐,?号|编,?[码]|报,?价|证,?号|价,?格|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", phone_left):
                 last_phone_mask = False
                 continue
-            if re.search("^\d{0,4}[.,]\d{2,}", phone_right):
+            if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+", phone_right):
                 last_phone_mask = False
                 continue
             # if:上一个phone实体不符合条件

+ 2 - 1
BiddingKG/dl/test/test4.py

@@ -68,7 +68,8 @@ if __name__=="__main__":
     # '''
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
     # print(predict("12", text))
+    print(predict("12", content))
     # test("12",text)
-    test("12",content)
+    # test("12",content)
     print("takes",time.time()-_time1)
     pass

+ 2 - 0
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -427,6 +427,8 @@ if __name__=="__main__":
     # 传真:0769-81216222,邮编:523000。(三),采购人:东莞市道滘镇教育管理中心,地址:广东省东莞市道滘镇花园街1号,联系人:李先生,联系电话:0769-81332303,传真:/,邮编:523000,各有关当事人对中标、成交结果有异议的,可以在中标、成交公告发布之日起7个工作日内以书面形式向(政府采购代理机构)(或采购人)提出质疑,逾期将依法不予受理,'''
     text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt", "r", encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div", id="pcontent"))
+    from BiddingKG.dl.interface.Preprocessing import tableToText
+    # print("tableToText:",tableToText(BeautifulSoup(re.sub("<html>|</html>|<body>|</body>","",content),"lxml")))
 #     text = '''
 # 采购代理机构:山东立行建设项目管理有限公司地址:山东省临沂市兰山县(区)柳青广州路与蒙河路交汇大官苑社区西沿街A区三楼南侧号,联系方式:17862288900,
 # '''