3 éve · 623a06c437
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -1078,9 +1078,11 @@ def segment(soup,final=True):
 
				     text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])","，",text)
			
 
				     #替换为中文分号
			
 
				     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])","；",text)
			
 
				+    # 感叹号替换为中文句号
			
 
				+    text = re.sub("(?<=[\u4e00-\u9fa5])[!！]|[!！](?=[\u4e00-\u9fa5])","。",text)
			
 
				     #替换"？"为 " " ,update:2021/7/20
			
 
				     text = re.sub("？"," ",text)
			
 
				-         
			
 
				+
			
 
				 
			
 
				     #替换"""为"“",否则导入deepdive出错
			
 
				     text = text.replace('"',"“").replace("\r","").replace("\n","，")
			
@@ -1925,10 +1927,11 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				 
			
 
				             # "联系人"正则补充提取  2021/11/15 新增
			
 
				             list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
			
 
				-            error_text = ['传真','网址','电子邮','联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
			
 
				+            error_text = ['交易','机构','教育','项目','公司','中标','开标','截标','监督','政府','国家','中国','技术','投标','传真','网址','电子邮',
			
 
				+                          '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
			
 
				             list_person_text = set(list_person_text + error_text)
			
 
				-            re_person = re.compile("联系人[:：]([\u4e00-\u9fa5]{2,3})(?=联系)|"
			
 
				-                                   "联系人[:：]([\u4e00-\u9fa5]工)|"
			
 
				+            re_person = re.compile("联系人[:：]([\u4e00-\u9fa5]工)|"
			
 
				+                                   "联系人[:：]([\u4e00-\u9fa5]{2,3})(?=联系)|"
			
 
				                                    "联系人[:：]([\u4e00-\u9fa5]{2,3})")
			
 
				             list_person = []
			
 
				             for match_result in re_person.finditer(sentence_text):
			
@@ -1937,6 +1940,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                 wordOffset_begin = match_result.start() + 4
			
 
				                 wordOffset_end = match_result.end()
			
 
				                 # print(text[wordOffset_begin:wordOffset_end])
			
 
				+                # 排除一些不为人名的实体
			
 
				+                if re.search("^[\u4e00-\u9fa5]{7,}([，。]|$)",sentence_text[wordOffset_begin:wordOffset_begin+20]):
			
 
				+                    continue
			
 
				                 if entity_text not in list_person_text and entity_text[:2] not in list_person_text:
			
 
				                     _person = dict()
			
 
				                     _person['body'] = entity_text
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -1093,7 +1093,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				         return [(main_roles[row], attributes[col]) for row, col in max_dispatch]
			
 
				 
			
 
				     # 正则提取电话号码实体
			
 
				-    key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
			
 
				+    # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
			
 
				     phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—－―]?\d{4}[-—－―]?\d{4}|'
			
 
				                        '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
			
 
				                        # '0[^0]\d{1,2}[-—－―][1-9]\d{6,7}/[1-9]\d{6,10}|'
			
@@ -1118,8 +1118,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				         res_set = set()
			
 
				         for i in re.finditer(phone, sentence_text):
			
 
				             res_set.add((i.group(), i.start(), i.end()))
			
 
				-        # for i in re.finditer(key_word, sentence_text):
			
 
				-        #     res_set.add((i.group(2), i.start() + len(i.group(1)), i.end()))
			
 
				         res_set = sorted(list(res_set),key=lambda x:x[1])
			
 
				         last_phone_mask = True
			
 
				         for item_idx in range(len(res_set)):
			
@@ -1131,10 +1129,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				                 if not re.search("电，?话", phone_left):
			
 
				                     last_phone_mask = False
			
 
				                     continue
			
 
				-            if re.search("帐，?号|编，?号|报，?价|证，?号|价，?格|[\(（]万?元[\)）]", phone_left):
			
 
				+            if re.search("注册[证号]|帐，?号|编，?[号码]|报，?价|证，?号|价，?格|[\(\（]万?元[\)\）]|[a-zA-Z]+\d*$", phone_left):
			
 
				                 last_phone_mask = False
			
 
				                 continue
			
 
				-            if re.search("^\d{0,4}[.,]\d{2,}", phone_right):
			
 
				+            if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+", phone_right):
			
 
				                 last_phone_mask = False
			
 
				                 continue
			
 
				             # if:上一个phone实体不符合条件
			
@@ -1742,6 +1740,73 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				         for _item in remove_list:
			
 
				             PackDict["Project"]["roleList"][i].linklist.remove(_item)
			
 
				 
			
 
				+    # 联系人——电子邮箱链接
			
 
				+    temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
			
 
				+    temporary_list3 = sorted(temporary_list3, key=lambda x: (x.sentence_index, x.begin_index))
			
 
				+    new_temporary_list3 = []
			
 
				+    for _split in new_split_list:
			
 
				+        temp_list = []
			
 
				+        for _entity in temporary_list3:
			
 
				+            if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
			
 
				+                _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
			
 
				+                temp_list.append(_entity)
			
 
				+            elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
			
 
				+                break
			
 
				+        new_temporary_list3.append(temp_list)
			
 
				+    # print(new_temporary_list3)
			
 
				+    match_list3 = []
			
 
				+    for split_index in range(len(new_temporary_list3)):
			
 
				+        split_entitys = new_temporary_list3[split_index]
			
 
				+        for index in range(len(split_entitys)):
			
 
				+            entity = split_entitys[index]
			
 
				+            if entity.entity_type == 'person':
			
 
				+                match_nums = 0
			
 
				+                for after_index in range(index + 1, min(len(split_entitys), index + 4)):
			
 
				+                    after_entity = split_entitys[after_index]
			
 
				+                    if match_nums > 2:
			
 
				+                        break
			
 
				+                    if after_entity.entity_type == 'email':
			
 
				+                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
			
 
				+                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
			
 
				+                        sentence_distance = after_entity.sentence_index - entity.sentence_index
			
 
				+                        if sentence_distance == 0:
			
 
				+                            if distance < 100:
			
 
				+                                if (entity.label == 0 and after_entity.label == 1) or (
			
 
				+                                        entity.label == 1 and after_entity.label == 2):
			
 
				+                                    distance = distance / 100
			
 
				+                                value = (-1 / 2 * (distance ** 2)) / 10000
			
 
				+                                match_list3.append(Match(entity, after_entity, value))
			
 
				+                                match_nums += 1
			
 
				+                        else:
			
 
				+                            if distance < 60:
			
 
				+                                if (entity.label == 0 and after_entity.label == 1) or (
			
 
				+                                        entity.label == 1 and after_entity.label == 2):
			
 
				+                                    distance = distance / 100
			
 
				+                                value = (-1 / 2 * (distance ** 2)) / 10000
			
 
				+                                match_list3.append(Match(entity, after_entity, value))
			
 
				+                                match_nums += 1
			
 
				+                # 前向查找匹配
			
 
				+                # if not match_nums:
			
 
				+                if index != 0:
			
 
				+                    previous_entity = split_entitys[index - 1]
			
 
				+                    if previous_entity.entity_type == 'email':
			
 
				+                        if previous_entity.sentence_index == entity.sentence_index:
			
 
				+                            distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
			
 
				+                                    tokens_num_dict[
			
 
				+                                        previous_entity.sentence_index] + previous_entity.end_index)
			
 
				+                            if distance < 30:
			
 
				+                                # 距离相等时，前向添加处罚值
			
 
				+                                # distance += 1
			
 
				+                                # 前向 没有 /10000
			
 
				+                                value = (-1 / 2 * (distance ** 2))
			
 
				+                                match_list3.append(Match(entity, previous_entity, value))
			
 
				+    # print(match_list3)
			
 
				+    # km算法分配求解
			
 
				+    result3 = dispatch(match_list3)
			
 
				+    for match in result3:
			
 
				+        match_person = match[0]
			
 
				+        match_email = match[1]
			
 
				+        match_person.pointer_email = match_email
			
 
				 
			
 
				     # # 1）第一个公司实体的招标人，则看看下一个实体是否为代理人，如果是则联系人错位连接 。2）在同一句中往后找联系人。3）连接不上在整个文章找联系人。
			
 
				     # temp_ent_list = []  # 临时列表，记录0,1角色及3联系人
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -1696,6 +1696,13 @@ class ProductAttributesPredictor():
 
				                         num = '0' + num
			
 
				                     order_begin = "%s-%s-01" % (y, m)
			
 
				                     order_end = "%s-%s-%s" % (y, m, num)
			
 
				+            else:
			
 
				+                y = str(datetime.datetime.now().year)
			
 
				+                num = self.get_monthlen(y, m)
			
 
				+                if len(num) < 2:
			
 
				+                    num = '0' + num
			
 
				+                order_begin = "%s-%s-01" % (y, m)
			
 
				+                order_end = "%s-%s-%s" % (y, m, num)
			
 
				             return order_begin, order_end
			
 
				 
			
 
				         t1 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})月?$', text)
			
--- a/BiddingKG/dl/relation_extraction/model.py
+++ b/BiddingKG/dl/relation_extraction/model.py
@@ -234,7 +234,7 @@ class Relation_extraction():
 
				         # self.word2vec = None
			
 
				         # if self.is_train:
			
 
				         #     self.word2vec = load('words2v_matrix.pkl')
			
 
				-        self.model_path = os.path.dirname(__file__)+'/../relation_extraction/models/my_best_model_oneoutput.weights'
			
 
				+        self.model_path = os.path.dirname(__file__)+'/../relation_extraction/models/my_best_model_oneoutput2.weights'
			
 
				         self.get_model()
			
 
				         if self.model_path:
			
 
				             self.train_model.load_weights(self.model_path)
			
--- a/BiddingKG/dl/relation_extraction/models/my_best_model_oneoutput2.weights
+++ b/BiddingKG/dl/relation_extraction/models/my_best_model_oneoutput2.weights
--- a/BiddingKG/dl/relation_extraction/models2/object_model/saved_model.pb
+++ b/BiddingKG/dl/relation_extraction/models2/object_model/saved_model.pb
--- a/BiddingKG/dl/relation_extraction/models2/object_model/variables/variables.data-00000-of-00001
+++ b/BiddingKG/dl/relation_extraction/models2/object_model/variables/variables.data-00000-of-00001
--- a/BiddingKG/dl/relation_extraction/models2/object_model/variables/variables.index
+++ b/BiddingKG/dl/relation_extraction/models2/object_model/variables/variables.index
--- a/BiddingKG/dl/relation_extraction/models2/subject_model/saved_model.pb
+++ b/BiddingKG/dl/relation_extraction/models2/subject_model/saved_model.pb
--- a/BiddingKG/dl/relation_extraction/models2/subject_model/variables/variables.data-00000-of-00001
+++ b/BiddingKG/dl/relation_extraction/models2/subject_model/variables/variables.data-00000-of-00001
--- a/BiddingKG/dl/relation_extraction/models2/subject_model/variables/variables.index
+++ b/BiddingKG/dl/relation_extraction/models2/subject_model/variables/variables.index
--- a/BiddingKG/dl/test/测试整个要素提取流程.py
+++ b/BiddingKG/dl/test/测试整个要素提取流程.py
@@ -121,8 +121,10 @@ def predict(doc_id,text):
 
				         for entity in entitys:
			
 
				             # print('**********实体信息****************')
			
 
				             if entity.entity_type=='person':
			
 
				-                print("联系方式：",end=' ')
			
 
				+                print("联系人-电话：",end=' ')
			
 
				                 print(entity.entity_text,[i.entity_text for i in entity.person_phone] if entity.person_phone else None,entity.label,entity.values)
			
 
				+                if entity.pointer_email:
			
 
				+                    print("联系人-邮箱：",entity.entity_text,entity.pointer_email.entity_text)
			
 
				                 # print(entity.begin_index, entity.end_index)
			
 
				                 print(entity.sentence_index)
			
 
				                 pass
			
@@ -425,6 +427,8 @@ if __name__=="__main__":
 
				     # 传真：0769-81216222，邮编：523000。(三)，采购人：东莞市道滘镇教育管理中心，地址：广东省东莞市道滘镇花园街1号，联系人：李先生，联系电话：0769-81332303，传真：/，邮编：523000，各有关当事人对中标、成交结果有异议的，可以在中标、成交公告发布之日起7个工作日内以书面形式向(政府采购代理机构)(或采购人)提出质疑，逾期将依法不予受理，'''
			
 
				     text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt", "r", encoding="utf8").read()
			
 
				     content = str(BeautifulSoup(text).find("div", id="pcontent"))
			
 
				+    from BiddingKG.dl.interface.Preprocessing import tableToText
			
 
				+    # print("tableToText:",tableToText(BeautifulSoup(re.sub("<html>|</html>|<body>|</body>","",content),"lxml")))
			
 
				 #     text = '''
			
 
				 # 采购代理机构：山东立行建设项目管理有限公司地址：山东省临沂市兰山县(区)柳青广州路与蒙河路交汇大官苑社区西沿街A区三楼南侧号，联系方式：17862288900，
			
 
				 # '''