3 роки тому · f2bb505942
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -297,7 +297,7 @@ def changeIndexFromWordToWords(tokens,word_index):
 
															     after_index = 0
														
 
															     for i in range(len(tokens)):
														
 
															         after_index = after_index+len(tokens[i])
														
 
															-        if before_index<=word_index and after_index>=word_index:
														
 
															+        if before_index<=word_index and after_index>word_index:
														
 
															             return i
														
 
															         before_index = after_index
														
--- a/BiddingKG/dl/entityLink/entityLink.py
+++ b/BiddingKG/dl/entityLink/entityLink.py
@@ -204,6 +204,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
															                 break
														
 
															         for p_sentence in list_sentence:
														
 
															             sentence = p_sentence.sentence_text
														
 
															+            sentence_entitys = [(ent.entity_text,ent.wordOffset_begin,ent.wordOffset_end) for ent in list_entity if ent.sentence_index==p_sentence.sentence_index and ent.entity_type in ['org','company']]
														
 
															             list_match = match_enterprise_max_first(sentence)
														
 
															             # print("list_match", list_match)
														
@@ -246,7 +247,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
															                             match_replace = True
														
 
															                             match_add = True
														
 
															                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
														
 
															-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
														
 
															+                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
														
 
															                             list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
														
 
															                             p_entity.entity_text = _match["entity_text"]
														
 
															                             p_entity.wordOffset_begin = _match["begin_index"]
														
@@ -261,7 +262,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
															                                 entity_type = "company"
														
 
															                                 begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
														
 
															-                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
														
 
															+                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"]-1)
														
 
															                                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
														
 
															                                 add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
														
 
															                                 add_entity.if_dict_match = 1
														
@@ -285,7 +286,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
															                                 else:
														
 
															                                     match_replace = True
														
 
															                                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
														
 
															-                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
														
 
															+                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
														
 
															                                     list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
														
 
															                                     p_entity.entity_text = _match["entity_text"]
														
 
															                                     p_entity.wordOffset_begin = _match["begin_index"]
														
@@ -294,23 +295,25 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
															                                     p_entity.end_index = end_index
														
 
															                                     p_entity.if_dict_match = 1
														
 
															                         elif _match["end_index"]>=p_entity.wordOffset_end:
														
 
															-                            match_replace = True
														
 
															-                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
														
 
															-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
														
 
															-                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
														
 
															-                            p_entity.entity_text = _match["entity_text"]
														
 
															-                            p_entity.wordOffset_begin = _match["begin_index"]
														
 
															-                            p_entity.wordOffset_end = _match["end_index"]
														
 
															-                            p_entity.begin_index = begin_index
														
 
															-                            p_entity.end_index = end_index
														
 
															-                            p_entity.entity_type = "company"
														
 
															-                            p_entity.if_dict_match = 1
														
 
															+                            # 原entity列表已有实体，则不重复添加
														
 
															+                            if (_match["entity_text"],_match["begin_index"],_match["end_index"]) not in sentence_entitys:
														
 
															+                                match_replace = True
														
 
															+                                begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
														
 
															+                                end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
														
 
															+                                list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
														
 
															+                                p_entity.entity_text = _match["entity_text"]
														
 
															+                                p_entity.wordOffset_begin = _match["begin_index"]
														
 
															+                                p_entity.wordOffset_end = _match["end_index"]
														
 
															+                                p_entity.begin_index = begin_index
														
 
															+                                p_entity.end_index = end_index
														
 
															+                                p_entity.entity_type = "company"
														
 
															+                                p_entity.if_dict_match = 1
														
 
															                     elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
														
 
															                         find_flag = True
														
 
															                         if p_entity.entity_type in ("org","company"):
														
 
															                             match_replace = True
														
 
															                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
														
 
															-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
														
 
															+                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
														
 
															                             list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
														
 
															                             p_entity.entity_text = _match["entity_text"]
														
 
															                             p_entity.wordOffset_begin = _match["begin_index"]
														
@@ -324,7 +327,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
															                     entity_type = "company"
														
 
															                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
														
 
															-                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
														
 
															+                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
														
 
															                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
														
 
															                     add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"])
														
 
															                     list_entity.append(add_entity)
														
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -1984,7 +1984,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
															             # "联系人"正则补充提取  2021/11/15 新增
														
 
															             list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
														
 
															             error_text = ['交易','机构','教育','项目','公司','中标','开标','截标','监督','政府','国家','中国','技术','投标','传真','网址','电子邮',
														
 
															-                          '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
														
 
															+                          '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理',
														
 
															+                          '代理人','采购','附件','注意','登录','报名','踏勘']
														
 
															             list_person_text = set(list_person_text + error_text)
														
 
															             re_person = re.compile("联系人[:：]([\u4e00-\u9fa5]工)|"
														
 
															                                    "联系人[:：]([\u4e00-\u9fa5]{2,3})(?=联系)|"
														
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -1026,7 +1026,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
															         temp_entity_list = []
														
 
															         if link_attribute=="money":
														
 
															             temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
														
 
															-                                (ent.entity_type=='money' and ent.label==1)]
														
 
															+                                (ent.entity_type=='money' and ent.label==1 and ent.values[ent.label]>=0.5)]
														
 
															             # 删除重复的‘中投标金额’，一般为大小写两种样式
														
 
															             drop_tendererMoney = []
														
 
															             for ent_idx in range(len(temp_entity_list)-1):
														
@@ -1698,7 +1698,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
															                                                 match_nums += 1
														
 
															                         # 实体无匹配时，尝试前向查找匹配
														
 
															                         if not match_nums:
														
 
															-                            if entity.label != 5 and entity.values[entity.label] > 0.5 and index != 0:
														
 
															+                            if (entity.label != 5 or entity.entity_text in roleSet) and entity.values[entity.label] >= 0.5 and index != 0:
														
 
															                                 previous_entity = split_entitys[index - 1]
														
 
															                                 if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
														
 
															                                     if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
														
@@ -2255,14 +2255,26 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
															                     PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
														
 
															                     # print('通过其他中标人投标金额校正中标金额')
														
 
															+    for item in list_pop:
														
 
															+        PackDict.pop(item)
														
 
															+
														
 
															+    # 公告中只有"招标人"且无"联系人"链接时，直接取文中倒数第一个联系人
														
 
															+    if len(PackDict)==1:
														
 
															+        k = list(PackDict.keys())[0]
														
 
															+        if len(PackDict[k]["roleList"])==1:
														
 
															+            if PackDict[k]["roleList"][0].role_name == "tenderee":
														
 
															+                if not PackDict[k]["roleList"][0].linklist:
														
 
															+                    for _entity in temporary_list2[::-1]:
														
 
															+                        if _entity.entity_type=='person' and _entity.label==3 and len(_entity.person_phone)>0:
														
 
															+                            _phone = [p.entity_text for p in _entity.person_phone]
														
 
															+                            for _p in _phone:
														
 
															+                                PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p))
														
 
															+                            break
														
 
															     for pack in PackDict.keys():
														
 
															         for i in range(len(PackDict[pack]["roleList"])):
														
 
															             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
														
 
															-    for item in list_pop:
														
 
															-        PackDict.pop(item)
														
 
															-        
														
 
															     return PackDict 
														
 
															 def initPackageAttr(RoleList,PackageSet):
														
--- a/BiddingKG/dl/metrics/extractMetric.py
+++ b/BiddingKG/dl/metrics/extractMetric.py
@@ -297,6 +297,11 @@ class ExtractMetric():
 
															             _dict[k] = len(v)
														
 
															             _dict[k_other] = len(dict_project.get(k_other,[]))
														
 
															             _dict["%s_union"%base_key] = len(set(v)&set(dict_project.get(k_other,[])))
														
 
															+            # if base_key in ['second_tenderer_person','third_tenderer_person']:
														
 
															+            #     print(base_key,"++++++++++++++++++++++++++++++++++++")
														
 
															+            #     print(set(v))
														
 
															+            #     print(set(dict_project.get(k_other,[])))
														
 
															+
														
 
															             set_k.add(base_key)
														
 
															         print("=========================")
														
 
															         print(_inter)
														
@@ -310,7 +315,7 @@ class ExtractMetric():
 
															     def getMetrics(self,list_diff):
														
 
															         dict_key_count = {}
														
 
															-        print("all_count:",list_diff)
														
 
															+        # print("all_count:",list_diff)
														
 
															         for _diff in list_diff:
														
 
															             for k,v in _diff.items():
														
 
															                 if k not in dict_key_count:
														
--- a/BiddingKG/dl/test/test4.py
+++ b/BiddingKG/dl/test/test4.py
@@ -71,7 +71,8 @@ if __name__=="__main__":
 
															     # '''
														
 
															     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
														
 
															     # print(predict("12", content,"打印机"))
														
 
															-    print(predict("12", text,"打印机"))
														
 
															+    # print(predict("12", text,"打印机"))
														
 
															     # test(12,content)
														
 
															+    test(12,text)
														
 
															     print("takes",time.time()-_time1)
														
 
															     pass