Forráskód Böngészése

Merge branch 'master' of http://192.168.2.65:3000/BIDI-ML/BIDI_ML_INFO_EXTRACTION into outside

 Conflicts:
	BiddingKG/dl/test/test4.py
luojiehua 3 éve
szülő
commit
623a06c437

+ 10 - 4
BiddingKG/dl/interface/Preprocessing.py

@@ -1078,9 +1078,11 @@ def segment(soup,final=True):
     text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
     #替换为中文分号
     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
+    # 感叹号替换为中文句号
+    text = re.sub("(?<=[\u4e00-\u9fa5])[!!]|[!!](?=[\u4e00-\u9fa5])","。",text)
     #替换"?"为 " " ,update:2021/7/20
     text = re.sub("?"," ",text)
-         
+
 
     #替换"""为"“",否则导入deepdive出错
     text = text.replace('"',"“").replace("\r","").replace("\n",",")
@@ -1925,10 +1927,11 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             # "联系人"正则补充提取  2021/11/15 新增
             list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
-            error_text = ['传真','网址','电子邮','联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
+            error_text = ['交易','机构','教育','项目','公司','中标','开标','截标','监督','政府','国家','中国','技术','投标','传真','网址','电子邮',
+                          '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
             list_person_text = set(list_person_text + error_text)
-            re_person = re.compile("联系人[::]([\u4e00-\u9fa5]{2,3})(?=联系)|"
-                                   "联系人[::]([\u4e00-\u9fa5])|"
+            re_person = re.compile("联系人[::]([\u4e00-\u9fa5])|"
+                                   "联系人[::]([\u4e00-\u9fa5]{2,3})(?=联系)|"
                                    "联系人[::]([\u4e00-\u9fa5]{2,3})")
             list_person = []
             for match_result in re_person.finditer(sentence_text):
@@ -1937,6 +1940,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 wordOffset_begin = match_result.start() + 4
                 wordOffset_end = match_result.end()
                 # print(text[wordOffset_begin:wordOffset_end])
+                # 排除一些不为人名的实体
+                if re.search("^[\u4e00-\u9fa5]{7,}([,。]|$)",sentence_text[wordOffset_begin:wordOffset_begin+20]):
+                    continue
                 if entity_text not in list_person_text and entity_text[:2] not in list_person_text:
                     _person = dict()
                     _person['body'] = entity_text

+ 70 - 5
BiddingKG/dl/interface/getAttributes.py

@@ -1093,7 +1093,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         return [(main_roles[row], attributes[col]) for row, col in max_dispatch]
 
     # 正则提取电话号码实体
-    key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
+    # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
     phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
                        '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
                        # '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
@@ -1118,8 +1118,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         res_set = set()
         for i in re.finditer(phone, sentence_text):
             res_set.add((i.group(), i.start(), i.end()))
-        # for i in re.finditer(key_word, sentence_text):
-        #     res_set.add((i.group(2), i.start() + len(i.group(1)), i.end()))
         res_set = sorted(list(res_set),key=lambda x:x[1])
         last_phone_mask = True
         for item_idx in range(len(res_set)):
@@ -1131,10 +1129,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 if not re.search("电,?话", phone_left):
                     last_phone_mask = False
                     continue
-            if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]", phone_left):
+            if re.search("注册[证号]|帐,?号|编,?[码]|报,?价|证,?号|价,?格|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", phone_left):
                 last_phone_mask = False
                 continue
-            if re.search("^\d{0,4}[.,]\d{2,}", phone_right):
+            if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+", phone_right):
                 last_phone_mask = False
                 continue
             # if:上一个phone实体不符合条件
@@ -1742,6 +1740,73 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         for _item in remove_list:
             PackDict["Project"]["roleList"][i].linklist.remove(_item)
 
+    # 联系人——电子邮箱链接
+    temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
+    temporary_list3 = sorted(temporary_list3, key=lambda x: (x.sentence_index, x.begin_index))
+    new_temporary_list3 = []
+    for _split in new_split_list:
+        temp_list = []
+        for _entity in temporary_list3:
+            if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
+                _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
+                temp_list.append(_entity)
+            elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
+                break
+        new_temporary_list3.append(temp_list)
+    # print(new_temporary_list3)
+    match_list3 = []
+    for split_index in range(len(new_temporary_list3)):
+        split_entitys = new_temporary_list3[split_index]
+        for index in range(len(split_entitys)):
+            entity = split_entitys[index]
+            if entity.entity_type == 'person':
+                match_nums = 0
+                for after_index in range(index + 1, min(len(split_entitys), index + 4)):
+                    after_entity = split_entitys[after_index]
+                    if match_nums > 2:
+                        break
+                    if after_entity.entity_type == 'email':
+                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
+                        sentence_distance = after_entity.sentence_index - entity.sentence_index
+                        if sentence_distance == 0:
+                            if distance < 100:
+                                if (entity.label == 0 and after_entity.label == 1) or (
+                                        entity.label == 1 and after_entity.label == 2):
+                                    distance = distance / 100
+                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                match_list3.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                        else:
+                            if distance < 60:
+                                if (entity.label == 0 and after_entity.label == 1) or (
+                                        entity.label == 1 and after_entity.label == 2):
+                                    distance = distance / 100
+                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                match_list3.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                # 前向查找匹配
+                # if not match_nums:
+                if index != 0:
+                    previous_entity = split_entitys[index - 1]
+                    if previous_entity.entity_type == 'email':
+                        if previous_entity.sentence_index == entity.sentence_index:
+                            distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
+                                    tokens_num_dict[
+                                        previous_entity.sentence_index] + previous_entity.end_index)
+                            if distance < 30:
+                                # 距离相等时,前向添加处罚值
+                                # distance += 1
+                                # 前向 没有 /10000
+                                value = (-1 / 2 * (distance ** 2))
+                                match_list3.append(Match(entity, previous_entity, value))
+    # print(match_list3)
+    # km算法分配求解
+    result3 = dispatch(match_list3)
+    for match in result3:
+        match_person = match[0]
+        match_email = match[1]
+        match_person.pointer_email = match_email
 
     # # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
     # temp_ent_list = []  # 临时列表,记录0,1角色及3联系人

+ 7 - 0
BiddingKG/dl/interface/predictor.py

@@ -1696,6 +1696,13 @@ class ProductAttributesPredictor():
                         num = '0' + num
                     order_begin = "%s-%s-01" % (y, m)
                     order_end = "%s-%s-%s" % (y, m, num)
+            else:
+                y = str(datetime.datetime.now().year)
+                num = self.get_monthlen(y, m)
+                if len(num) < 2:
+                    num = '0' + num
+                order_begin = "%s-%s-01" % (y, m)
+                order_end = "%s-%s-%s" % (y, m, num)
             return order_begin, order_end
 
         t1 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})月?$', text)

+ 1 - 1
BiddingKG/dl/relation_extraction/model.py

@@ -234,7 +234,7 @@ class Relation_extraction():
         # self.word2vec = None
         # if self.is_train:
         #     self.word2vec = load('words2v_matrix.pkl')
-        self.model_path = os.path.dirname(__file__)+'/../relation_extraction/models/my_best_model_oneoutput.weights'
+        self.model_path = os.path.dirname(__file__)+'/../relation_extraction/models/my_best_model_oneoutput2.weights'
         self.get_model()
         if self.model_path:
             self.train_model.load_weights(self.model_path)

BIN
BiddingKG/dl/relation_extraction/models/my_best_model_oneoutput2.weights


BIN
BiddingKG/dl/relation_extraction/models2/object_model/saved_model.pb


BIN
BiddingKG/dl/relation_extraction/models2/object_model/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/relation_extraction/models2/object_model/variables/variables.index


BIN
BiddingKG/dl/relation_extraction/models2/subject_model/saved_model.pb


BIN
BiddingKG/dl/relation_extraction/models2/subject_model/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/relation_extraction/models2/subject_model/variables/variables.index


+ 5 - 1
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -121,8 +121,10 @@ def predict(doc_id,text):
         for entity in entitys:
             # print('**********实体信息****************')
             if entity.entity_type=='person':
-                print("联系方式:",end=' ')
+                print("联系人-电话:",end=' ')
                 print(entity.entity_text,[i.entity_text for i in entity.person_phone] if entity.person_phone else None,entity.label,entity.values)
+                if entity.pointer_email:
+                    print("联系人-邮箱:",entity.entity_text,entity.pointer_email.entity_text)
                 # print(entity.begin_index, entity.end_index)
                 print(entity.sentence_index)
                 pass
@@ -425,6 +427,8 @@ if __name__=="__main__":
     # 传真:0769-81216222,邮编:523000。(三),采购人:东莞市道滘镇教育管理中心,地址:广东省东莞市道滘镇花园街1号,联系人:李先生,联系电话:0769-81332303,传真:/,邮编:523000,各有关当事人对中标、成交结果有异议的,可以在中标、成交公告发布之日起7个工作日内以书面形式向(政府采购代理机构)(或采购人)提出质疑,逾期将依法不予受理,'''
     text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt", "r", encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div", id="pcontent"))
+    from BiddingKG.dl.interface.Preprocessing import tableToText
+    # print("tableToText:",tableToText(BeautifulSoup(re.sub("<html>|</html>|<body>|</body>","",content),"lxml")))
 #     text = '''
 # 采购代理机构:山东立行建设项目管理有限公司地址:山东省临沂市兰山县(区)柳青广州路与蒙河路交汇大官苑社区西沿街A区三楼南侧号,联系方式:17862288900,
 # '''