Переглянути джерело

公司-中标金额链接更新,字词转换方法修正

znj 3 роки тому
батько
коміт
f2bb505942

+ 1 - 1
BiddingKG/dl/common/Utils.py

@@ -297,7 +297,7 @@ def changeIndexFromWordToWords(tokens,word_index):
     after_index = 0
     for i in range(len(tokens)):
         after_index = after_index+len(tokens[i])
-        if before_index<=word_index and after_index>=word_index:
+        if before_index<=word_index and after_index>word_index:
             return i
         before_index = after_index
         

+ 19 - 16
BiddingKG/dl/entityLink/entityLink.py

@@ -204,6 +204,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                 break
         for p_sentence in list_sentence:
             sentence = p_sentence.sentence_text
+            sentence_entitys = [(ent.entity_text,ent.wordOffset_begin,ent.wordOffset_end) for ent in list_entity if ent.sentence_index==p_sentence.sentence_index and ent.entity_type in ['org','company']]
             list_match = match_enterprise_max_first(sentence)
             # print("list_match", list_match)
 
@@ -246,7 +247,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                             match_replace = True
                             match_add = True
                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
+                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                             list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                             p_entity.entity_text = _match["entity_text"]
                             p_entity.wordOffset_begin = _match["begin_index"]
@@ -261,7 +262,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                 entity_type = "company"
 
                                 begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
-                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
+                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"]-1)
                                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                                 add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
                                 add_entity.if_dict_match = 1
@@ -285,7 +286,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                 else:
                                     match_replace = True
                                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
-                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
+                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                                     list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                                     p_entity.entity_text = _match["entity_text"]
                                     p_entity.wordOffset_begin = _match["begin_index"]
@@ -294,23 +295,25 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                     p_entity.end_index = end_index
                                     p_entity.if_dict_match = 1
                         elif _match["end_index"]>=p_entity.wordOffset_end:
-                            match_replace = True
-                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
-                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
-                            p_entity.entity_text = _match["entity_text"]
-                            p_entity.wordOffset_begin = _match["begin_index"]
-                            p_entity.wordOffset_end = _match["end_index"]
-                            p_entity.begin_index = begin_index
-                            p_entity.end_index = end_index
-                            p_entity.entity_type = "company"
-                            p_entity.if_dict_match = 1
+                            # 原entity列表已有实体,则不重复添加
+                            if (_match["entity_text"],_match["begin_index"],_match["end_index"]) not in sentence_entitys:
+                                match_replace = True
+                                begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
+                                end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
+                                list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
+                                p_entity.entity_text = _match["entity_text"]
+                                p_entity.wordOffset_begin = _match["begin_index"]
+                                p_entity.wordOffset_end = _match["end_index"]
+                                p_entity.begin_index = begin_index
+                                p_entity.end_index = end_index
+                                p_entity.entity_type = "company"
+                                p_entity.if_dict_match = 1
                     elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                         find_flag = True
                         if p_entity.entity_type in ("org","company"):
                             match_replace = True
                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
+                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                             list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                             p_entity.entity_text = _match["entity_text"]
                             p_entity.wordOffset_begin = _match["begin_index"]
@@ -324,7 +327,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                     entity_type = "company"
 
                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
-                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
+                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                     add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"])
                     list_entity.append(add_entity)

+ 2 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -1984,7 +1984,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             # "联系人"正则补充提取  2021/11/15 新增
             list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
             error_text = ['交易','机构','教育','项目','公司','中标','开标','截标','监督','政府','国家','中国','技术','投标','传真','网址','电子邮',
-                          '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
+                          '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理',
+                          '代理人','采购','附件','注意','登录','报名','踏勘']
             list_person_text = set(list_person_text + error_text)
             re_person = re.compile("联系人[::]([\u4e00-\u9fa5]工)|"
                                    "联系人[::]([\u4e00-\u9fa5]{2,3})(?=联系)|"

+ 17 - 5
BiddingKG/dl/interface/getAttributes.py

@@ -1026,7 +1026,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         temp_entity_list = []
         if link_attribute=="money":
             temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
-                                (ent.entity_type=='money' and ent.label==1)]
+                                (ent.entity_type=='money' and ent.label==1 and ent.values[ent.label]>=0.5)]
             # 删除重复的‘中投标金额’,一般为大小写两种样式
             drop_tendererMoney = []
             for ent_idx in range(len(temp_entity_list)-1):
@@ -1698,7 +1698,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                                 match_nums += 1
                         # 实体无匹配时,尝试前向查找匹配
                         if not match_nums:
-                            if entity.label != 5 and entity.values[entity.label] > 0.5 and index != 0:
+                            if (entity.label != 5 or entity.entity_text in roleSet) and entity.values[entity.label] >= 0.5 and index != 0:
                                 previous_entity = split_entitys[index - 1]
                                 if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
                                     if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
@@ -2255,14 +2255,26 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                     PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
                     # print('通过其他中标人投标金额校正中标金额')
 
+    for item in list_pop:
+        PackDict.pop(item)
+
+    # 公告中只有"招标人"且无"联系人"链接时,直接取文中倒数第一个联系人
+    if len(PackDict)==1:
+        k = list(PackDict.keys())[0]
+        if len(PackDict[k]["roleList"])==1:
+            if PackDict[k]["roleList"][0].role_name == "tenderee":
+                if not PackDict[k]["roleList"][0].linklist:
+                    for _entity in temporary_list2[::-1]:
+                        if _entity.entity_type=='person' and _entity.label==3 and len(_entity.person_phone)>0:
+                            _phone = [p.entity_text for p in _entity.person_phone]
+                            for _p in _phone:
+                                PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p))
+                            break
 
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):
             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
 
-    for item in list_pop:
-        PackDict.pop(item)
-        
     return PackDict 
 
 def initPackageAttr(RoleList,PackageSet):

+ 6 - 1
BiddingKG/dl/metrics/extractMetric.py

@@ -297,6 +297,11 @@ class ExtractMetric():
             _dict[k] = len(v)
             _dict[k_other] = len(dict_project.get(k_other,[]))
             _dict["%s_union"%base_key] = len(set(v)&set(dict_project.get(k_other,[])))
+            # if base_key in ['second_tenderer_person','third_tenderer_person']:
+            #     print(base_key,"++++++++++++++++++++++++++++++++++++")
+            #     print(set(v))
+            #     print(set(dict_project.get(k_other,[])))
+
             set_k.add(base_key)
         print("=========================")
         print(_inter)
@@ -310,7 +315,7 @@ class ExtractMetric():
 
     def getMetrics(self,list_diff):
         dict_key_count = {}
-        print("all_count:",list_diff)
+        # print("all_count:",list_diff)
         for _diff in list_diff:
             for k,v in _diff.items():
                 if k not in dict_key_count:

+ 2 - 1
BiddingKG/dl/test/test4.py

@@ -71,7 +71,8 @@ if __name__=="__main__":
     # '''
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
     # print(predict("12", content,"打印机"))
-    print(predict("12", text,"打印机"))
+    # print(predict("12", text,"打印机"))
     # test(12,content)
+    test(12,text)
     print("takes",time.time()-_time1)
     pass