3 роки тому · f2bb505942
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -297,7 +297,7 @@ def changeIndexFromWordToWords(tokens,word_index):
 
				     after_index = 0
			
 
				     for i in range(len(tokens)):
			
 
				         after_index = after_index+len(tokens[i])
			
 
				-        if before_index<=word_index and after_index>=word_index:
			
 
				+        if before_index<=word_index and after_index>word_index:
			
 
				             return i
			
 
				         before_index = after_index
			
 
				         
			
--- a/BiddingKG/dl/entityLink/entityLink.py
+++ b/BiddingKG/dl/entityLink/entityLink.py
@@ -204,6 +204,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                 break
			
 
				         for p_sentence in list_sentence:
			
 
				             sentence = p_sentence.sentence_text
			
 
				+            sentence_entitys = [(ent.entity_text,ent.wordOffset_begin,ent.wordOffset_end) for ent in list_entity if ent.sentence_index==p_sentence.sentence_index and ent.entity_type in ['org','company']]
			
 
				             list_match = match_enterprise_max_first(sentence)
			
 
				             # print("list_match", list_match)
			
 
				 
			
@@ -246,7 +247,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                             match_replace = True
			
 
				                             match_add = True
			
 
				                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
			
 
				-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
			
 
				+                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
			
 
				                             list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
			
 
				                             p_entity.entity_text = _match["entity_text"]
			
 
				                             p_entity.wordOffset_begin = _match["begin_index"]
			
@@ -261,7 +262,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                                 entity_type = "company"
			
 
				 
			
 
				                                 begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
			
 
				-                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
			
 
				+                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"]-1)
			
 
				                                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
			
 
				                                 add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
			
 
				                                 add_entity.if_dict_match = 1
			
@@ -285,7 +286,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                                 else:
			
 
				                                     match_replace = True
			
 
				                                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
			
 
				-                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
			
 
				+                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
			
 
				                                     list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
			
 
				                                     p_entity.entity_text = _match["entity_text"]
			
 
				                                     p_entity.wordOffset_begin = _match["begin_index"]
			
@@ -294,23 +295,25 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                                     p_entity.end_index = end_index
			
 
				                                     p_entity.if_dict_match = 1
			
 
				                         elif _match["end_index"]>=p_entity.wordOffset_end:
			
 
				-                            match_replace = True
			
 
				-                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
			
 
				-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
			
 
				-                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
			
 
				-                            p_entity.entity_text = _match["entity_text"]
			
 
				-                            p_entity.wordOffset_begin = _match["begin_index"]
			
 
				-                            p_entity.wordOffset_end = _match["end_index"]
			
 
				-                            p_entity.begin_index = begin_index
			
 
				-                            p_entity.end_index = end_index
			
 
				-                            p_entity.entity_type = "company"
			
 
				-                            p_entity.if_dict_match = 1
			
 
				+                            # 原entity列表已有实体，则不重复添加
			
 
				+                            if (_match["entity_text"],_match["begin_index"],_match["end_index"]) not in sentence_entitys:
			
 
				+                                match_replace = True
			
 
				+                                begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
			
 
				+                                end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
			
 
				+                                list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
			
 
				+                                p_entity.entity_text = _match["entity_text"]
			
 
				+                                p_entity.wordOffset_begin = _match["begin_index"]
			
 
				+                                p_entity.wordOffset_end = _match["end_index"]
			
 
				+                                p_entity.begin_index = begin_index
			
 
				+                                p_entity.end_index = end_index
			
 
				+                                p_entity.entity_type = "company"
			
 
				+                                p_entity.if_dict_match = 1
			
 
				                     elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
			
 
				                         find_flag = True
			
 
				                         if p_entity.entity_type in ("org","company"):
			
 
				                             match_replace = True
			
 
				                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
			
 
				-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
			
 
				+                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
			
 
				                             list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
			
 
				                             p_entity.entity_text = _match["entity_text"]
			
 
				                             p_entity.wordOffset_begin = _match["begin_index"]
			
@@ -324,7 +327,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                     entity_type = "company"
			
 
				 
			
 
				                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
			
 
				-                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
			
 
				+                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
			
 
				                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
			
 
				                     add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"])
			
 
				                     list_entity.append(add_entity)
			
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -1984,7 +1984,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				             # "联系人"正则补充提取  2021/11/15 新增
			
 
				             list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
			
 
				             error_text = ['交易','机构','教育','项目','公司','中标','开标','截标','监督','政府','国家','中国','技术','投标','传真','网址','电子邮',
			
 
				-                          '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
			
 
				+                          '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理',
			
 
				+                          '代理人','采购','附件','注意','登录','报名','踏勘']
			
 
				             list_person_text = set(list_person_text + error_text)
			
 
				             re_person = re.compile("联系人[:：]([\u4e00-\u9fa5]工)|"
			
 
				                                    "联系人[:：]([\u4e00-\u9fa5]{2,3})(?=联系)|"
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -1026,7 +1026,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				         temp_entity_list = []
			
 
				         if link_attribute=="money":
			
 
				             temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
			
 
				-                                (ent.entity_type=='money' and ent.label==1)]
			
 
				+                                (ent.entity_type=='money' and ent.label==1 and ent.values[ent.label]>=0.5)]
			
 
				             # 删除重复的‘中投标金额’，一般为大小写两种样式
			
 
				             drop_tendererMoney = []
			
 
				             for ent_idx in range(len(temp_entity_list)-1):
			
@@ -1698,7 +1698,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				                                                 match_nums += 1
			
 
				                         # 实体无匹配时，尝试前向查找匹配
			
 
				                         if not match_nums:
			
 
				-                            if entity.label != 5 and entity.values[entity.label] > 0.5 and index != 0:
			
 
				+                            if (entity.label != 5 or entity.entity_text in roleSet) and entity.values[entity.label] >= 0.5 and index != 0:
			
 
				                                 previous_entity = split_entitys[index - 1]
			
 
				                                 if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
			
 
				                                     if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
			
@@ -2255,14 +2255,26 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
				                     PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
			
 
				                     # print('通过其他中标人投标金额校正中标金额')
			
 
				 
			
 
				+    for item in list_pop:
			
 
				+        PackDict.pop(item)
			
 
				+
			
 
				+    # 公告中只有"招标人"且无"联系人"链接时，直接取文中倒数第一个联系人
			
 
				+    if len(PackDict)==1:
			
 
				+        k = list(PackDict.keys())[0]
			
 
				+        if len(PackDict[k]["roleList"])==1:
			
 
				+            if PackDict[k]["roleList"][0].role_name == "tenderee":
			
 
				+                if not PackDict[k]["roleList"][0].linklist:
			
 
				+                    for _entity in temporary_list2[::-1]:
			
 
				+                        if _entity.entity_type=='person' and _entity.label==3 and len(_entity.person_phone)>0:
			
 
				+                            _phone = [p.entity_text for p in _entity.person_phone]
			
 
				+                            for _p in _phone:
			
 
				+                                PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p))
			
 
				+                            break
			
 
				 
			
 
				     for pack in PackDict.keys():
			
 
				         for i in range(len(PackDict[pack]["roleList"])):
			
 
				             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
			
 
				 
			
 
				-    for item in list_pop:
			
 
				-        PackDict.pop(item)
			
 
				-        
			
 
				     return PackDict 
			
 
				 
			
 
				 def initPackageAttr(RoleList,PackageSet):
			
--- a/BiddingKG/dl/metrics/extractMetric.py
+++ b/BiddingKG/dl/metrics/extractMetric.py
@@ -297,6 +297,11 @@ class ExtractMetric():
 
				             _dict[k] = len(v)
			
 
				             _dict[k_other] = len(dict_project.get(k_other,[]))
			
 
				             _dict["%s_union"%base_key] = len(set(v)&set(dict_project.get(k_other,[])))
			
 
				+            # if base_key in ['second_tenderer_person','third_tenderer_person']:
			
 
				+            #     print(base_key,"++++++++++++++++++++++++++++++++++++")
			
 
				+            #     print(set(v))
			
 
				+            #     print(set(dict_project.get(k_other,[])))
			
 
				+
			
 
				             set_k.add(base_key)
			
 
				         print("=========================")
			
 
				         print(_inter)
			
@@ -310,7 +315,7 @@ class ExtractMetric():
 
				 
			
 
				     def getMetrics(self,list_diff):
			
 
				         dict_key_count = {}
			
 
				-        print("all_count:",list_diff)
			
 
				+        # print("all_count:",list_diff)
			
 
				         for _diff in list_diff:
			
 
				             for k,v in _diff.items():
			
 
				                 if k not in dict_key_count:
			
--- a/BiddingKG/dl/test/test4.py
+++ b/BiddingKG/dl/test/test4.py
@@ -71,7 +71,8 @@ if __name__=="__main__":
 
				     # '''
			
 
				     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
			
 
				     # print(predict("12", content,"打印机"))
			
 
				-    print(predict("12", text,"打印机"))
			
 
				+    # print(predict("12", text,"打印机"))
			
 
				     # test(12,content)
			
 
				+    test(12,text)
			
 
				     print("takes",time.time()-_time1)
			
 
				     pass