Browse Source

关系链接模型优化

znj 3 years ago
parent
commit
32fc2c0d10

+ 68 - 3
BiddingKG/dl/interface/getAttributes.py

@@ -1093,7 +1093,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         return [(main_roles[row], attributes[col]) for row, col in max_dispatch]
 
     # 正则提取电话号码实体
-    key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
+    # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
     phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
                        '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
                        # '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
@@ -1118,8 +1118,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         res_set = set()
         for i in re.finditer(phone, sentence_text):
             res_set.add((i.group(), i.start(), i.end()))
-        # for i in re.finditer(key_word, sentence_text):
-        #     res_set.add((i.group(2), i.start() + len(i.group(1)), i.end()))
         res_set = sorted(list(res_set),key=lambda x:x[1])
         last_phone_mask = True
         for item_idx in range(len(res_set)):
@@ -1742,6 +1740,73 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         for _item in remove_list:
             PackDict["Project"]["roleList"][i].linklist.remove(_item)
 
+    # 联系人——电子邮箱链接
+    temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
+    temporary_list3 = sorted(temporary_list3, key=lambda x: (x.sentence_index, x.begin_index))
+    new_temporary_list3 = []
+    for _split in new_split_list:
+        temp_list = []
+        for _entity in temporary_list3:
+            if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
+                _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
+                temp_list.append(_entity)
+            elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
+                break
+        new_temporary_list3.append(temp_list)
+    # print(new_temporary_list3)
+    match_list3 = []
+    for split_index in range(len(new_temporary_list3)):
+        split_entitys = new_temporary_list3[split_index]
+        for index in range(len(split_entitys)):
+            entity = split_entitys[index]
+            if entity.entity_type == 'person':
+                match_nums = 0
+                for after_index in range(index + 1, min(len(split_entitys), index + 4)):
+                    after_entity = split_entitys[after_index]
+                    if match_nums > 2:
+                        break
+                    if after_entity.entity_type == 'email':
+                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
+                        sentence_distance = after_entity.sentence_index - entity.sentence_index
+                        if sentence_distance == 0:
+                            if distance < 100:
+                                if (entity.label == 0 and after_entity.label == 1) or (
+                                        entity.label == 1 and after_entity.label == 2):
+                                    distance = distance / 100
+                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                match_list3.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                        else:
+                            if distance < 60:
+                                if (entity.label == 0 and after_entity.label == 1) or (
+                                        entity.label == 1 and after_entity.label == 2):
+                                    distance = distance / 100
+                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                match_list3.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                # 前向查找匹配
+                # if not match_nums:
+                if index != 0:
+                    previous_entity = split_entitys[index - 1]
+                    if previous_entity.entity_type == 'email':
+                        if previous_entity.sentence_index == entity.sentence_index:
+                            distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
+                                    tokens_num_dict[
+                                        previous_entity.sentence_index] + previous_entity.end_index)
+                            if distance < 30:
+                                # 距离相等时,前向添加处罚值
+                                # distance += 1
+                                # 前向 没有 /10000
+                                value = (-1 / 2 * (distance ** 2))
+                                match_list3.append(Match(entity, previous_entity, value))
+    # print(match_list3)
+    # km算法分配求解
+    result3 = dispatch(match_list3)
+    for match in result3:
+        match_person = match[0]
+        match_email = match[1]
+        match_person.pointer_email = match_email
 
     # # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
     # temp_ent_list = []  # 临时列表,记录0,1角色及3联系人

+ 1 - 1
BiddingKG/dl/relation_extraction/model.py

@@ -234,7 +234,7 @@ class Relation_extraction():
         # self.word2vec = None
         # if self.is_train:
         #     self.word2vec = load('words2v_matrix.pkl')
-        self.model_path = os.path.dirname(__file__)+'/../relation_extraction/models/my_best_model_oneoutput.weights'
+        self.model_path = os.path.dirname(__file__)+'/../relation_extraction/models/my_best_model_oneoutput2.weights'
         self.get_model()
         if self.model_path:
             self.train_model.load_weights(self.model_path)

BIN
BiddingKG/dl/relation_extraction/models/my_best_model_oneoutput2.weights


BIN
BiddingKG/dl/relation_extraction/models2/object_model/saved_model.pb


BIN
BiddingKG/dl/relation_extraction/models2/object_model/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/relation_extraction/models2/object_model/variables/variables.index


BIN
BiddingKG/dl/relation_extraction/models2/subject_model/saved_model.pb


BIN
BiddingKG/dl/relation_extraction/models2/subject_model/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/relation_extraction/models2/subject_model/variables/variables.index


+ 3 - 1
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -121,8 +121,10 @@ def predict(doc_id,text):
         for entity in entitys:
             # print('**********实体信息****************')
             if entity.entity_type=='person':
-                print("联系方式:",end=' ')
+                print("联系人-电话:",end=' ')
                 print(entity.entity_text,[i.entity_text for i in entity.person_phone] if entity.person_phone else None,entity.label,entity.values)
+                if entity.pointer_email:
+                    print("联系人-邮箱:",entity.entity_text,entity.pointer_email.entity_text)
                 # print(entity.begin_index, entity.end_index)
                 print(entity.sentence_index)
                 pass