Browse Source

Merge branch 'master' of http://192.168.2.65:3000/BIDI-ML/BIDI_ML_INFO_EXTRACTION

luojiehua 3 năm trước cách đây
mục cha
commit
057e6eb28a

+ 1 - 1
BiddingKG/dl/common/Utils.py

@@ -297,7 +297,7 @@ def changeIndexFromWordToWords(tokens,word_index):
     after_index = 0
     for i in range(len(tokens)):
         after_index = after_index+len(tokens[i])
-        if before_index<=word_index and after_index>=word_index:
+        if before_index<=word_index and after_index>word_index:
             return i
         before_index = after_index
         

+ 19 - 16
BiddingKG/dl/entityLink/entityLink.py

@@ -205,6 +205,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                 break
         for p_sentence in list_sentence:
             sentence = p_sentence.sentence_text
+            sentence_entitys = [(ent.entity_text,ent.wordOffset_begin,ent.wordOffset_end) for ent in list_entity if ent.sentence_index==p_sentence.sentence_index and ent.entity_type in ['org','company']]
             list_match = match_enterprise_max_first(sentence)
             # print("list_match", list_match)
 
@@ -247,7 +248,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                             match_replace = True
                             match_add = True
                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
+                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                             list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                             p_entity.entity_text = _match["entity_text"]
                             p_entity.wordOffset_begin = _match["begin_index"]
@@ -262,7 +263,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                 entity_type = "company"
 
                                 begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
-                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
+                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"]-1)
                                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                                 add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
                                 add_entity.if_dict_match = 1
@@ -286,7 +287,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                 else:
                                     match_replace = True
                                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
-                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
+                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                                     list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                                     p_entity.entity_text = _match["entity_text"]
                                     p_entity.wordOffset_begin = _match["begin_index"]
@@ -295,23 +296,25 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                     p_entity.end_index = end_index
                                     p_entity.if_dict_match = 1
                         elif _match["end_index"]>=p_entity.wordOffset_end:
-                            match_replace = True
-                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
-                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
-                            p_entity.entity_text = _match["entity_text"]
-                            p_entity.wordOffset_begin = _match["begin_index"]
-                            p_entity.wordOffset_end = _match["end_index"]
-                            p_entity.begin_index = begin_index
-                            p_entity.end_index = end_index
-                            p_entity.entity_type = "company"
-                            p_entity.if_dict_match = 1
+                            # 原entity列表已有实体,则不重复添加
+                            if (_match["entity_text"],_match["begin_index"],_match["end_index"]) not in sentence_entitys:
+                                match_replace = True
+                                begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
+                                end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
+                                list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
+                                p_entity.entity_text = _match["entity_text"]
+                                p_entity.wordOffset_begin = _match["begin_index"]
+                                p_entity.wordOffset_end = _match["end_index"]
+                                p_entity.begin_index = begin_index
+                                p_entity.end_index = end_index
+                                p_entity.entity_type = "company"
+                                p_entity.if_dict_match = 1
                     elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                         find_flag = True
                         if p_entity.entity_type in ("org","company"):
                             match_replace = True
                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
-                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
+                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                             list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                             p_entity.entity_text = _match["entity_text"]
                             p_entity.wordOffset_begin = _match["begin_index"]
@@ -325,7 +328,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                     entity_type = "company"
 
                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
-                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
+                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                     add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"])
                     list_entity.append(add_entity)

+ 8 - 2
BiddingKG/dl/interface/Entitys.py

@@ -172,6 +172,8 @@ class Entity():
         self.if_dict_match = 0  # 2021/12/21 新增,判断公司实体是否由字典识别得到
         self.is_total_money = 0  # 2021/12/29 新增,判断金额是否总价
         self.is_unit_money = 0  # 2021/12/29 新增,判断金额是否单价
+        self.pointer_serviceTime = None  # 2022/01/05 新增,中标人对应链接"服务期限(工期)"
+        self.pointer_ratio = None  # 2022/01/05 新增,中标人对应链接"中投标金额->费率、下浮率"
         self.origin_entity_text = ''  # 2022/1/5 新增,记录字典替换的原来的实体名
 
     def set_Role(self,role_label,role_values):
@@ -262,11 +264,15 @@ class Role():
         self.money_prob = money_prob
         self.linklist = linklist
         self.money_unit = '' # 2021/8/17 新增 保存金额单位
-        
+        # 中投标人属性
+        self.ratio = "" #2022/01/06 新增 保存中投标金额相关费率
+        self.serviceTime = "" #2021/01/06 新增 保存服务期限(工期)
+
     def getString(self):
         self.linklist = [item for item in set(self.linklist)]
         # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist]
-        result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist, self.money_unit]
+        # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist, self.money_unit]
+        result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist, self.money_unit,self.ratio,self.serviceTime]
         return result
 
 # 用于KM算法的组合配对

+ 2 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -1984,7 +1984,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             # "联系人"正则补充提取  2021/11/15 新增
             list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
             error_text = ['交易','机构','教育','项目','公司','中标','开标','截标','监督','政府','国家','中国','技术','投标','传真','网址','电子邮',
-                          '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
+                          '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理',
+                          '代理人','采购','附件','注意','登录','报名','踏勘']
             list_person_text = set(list_person_text + error_text)
             re_person = re.compile("联系人[::]([\u4e00-\u9fa5]工)|"
                                    "联系人[::]([\u4e00-\u9fa5]{2,3})(?=联系)|"

+ 260 - 143
BiddingKG/dl/interface/getAttributes.py

@@ -768,6 +768,26 @@ def getPackagesFromArticle(list_sentence,list_entity):
                 PackageList.append(copy_pack)
     return PackageList,PackageSet,dict_packageCode
 
+# KM matching: pair main roles with attributes by solving an assignment problem over the match scores
+def dispatch(match_list):  # match_list: items exposing .main_role, .attribute and .value
+    main_roles = list(set([match.main_role for match in match_list]))  # distinct main roles -> matrix rows
+    attributes = list(set([match.attribute for match in match_list]))  # distinct attributes -> matrix columns
+
+    label = np.zeros(shape=(len(main_roles), len(attributes)))  # a cell left at 0 means no candidate match for that pair
+    for match in match_list:
+        main_role = match.main_role
+        attribute = match.attribute
+        value = match.value
+        label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000  # score shifted by 10000; a later match for the same pair overwrites the earlier one
+    # print(label)
+    gragh = -label  # negated scores; presumably because linear_sum_assignment minimises total cost -- TODO confirm against its import
+    # KM algorithm (solved through linear_sum_assignment)
+    row, col = linear_sum_assignment(gragh)
+    max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value]  # keep only assigned pairs whose cell is non-zero
+    # return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch]
+    return [(main_roles[row], attributes[col]) for row, col in max_dispatch]  # list of (main_role, attribute) tuples
+
+from BiddingKG.dl.common.Utils import getUnifyMoney
 from BiddingKG.dl.interface.modelFactory import Model_relation_extraction
 relationExtraction_model = Model_relation_extraction()
 def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,list_sentence,on_value = 0.5,on_value_person=0.5,sentence_len=4):
@@ -812,7 +832,15 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                     packDict[packageName]["roleList"][i].money_unit = money.money_unit
                 # print('链接中的金额:{0}, 单位:{1}'.format(money.entity_text, money.money_unit))
         return packDict
-    
+    def addRatioByEntity(packDict,packageName,entity,ratio):  # store ratio.entity_text on the roles of package `packageName` whose entity_text equals `entity`
+        for i in range(len(packDict[packageName]["roleList"])):  # scans the whole roleList; no early exit, so every matching role is updated
+            if packDict[packageName]["roleList"][i].entity_text==entity:  # `entity` is compared directly with the role's text
+                packDict[packageName]["roleList"][i].ratio = ratio.entity_text  # mutates packDict in place; the function returns nothing
+    def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):  # store serviceTime.entity_text on the roles of package `packageName` whose entity_text equals `entity`
+        for i in range(len(packDict[packageName]["roleList"])):  # scans the whole roleList; no early exit, so every matching role is updated
+            if packDict[packageName]["roleList"][i].entity_text==entity:  # `entity` is compared directly with the role's text
+                packDict[packageName]["roleList"][i].serviceTime = serviceTime.entity_text  # mutates packDict in place; the function returns nothing
+
     #根据实体名称得到角色
     def getRoleWithText(packDict,entity_text):
         for pack in packDict.keys():
@@ -838,8 +866,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
     
     #遍历所有实体
-    while(p_entity<len(list_entity)):
-        entity = list_entity[p_entity]
+    # while(p_entity<len(list_entity)):
+    #     entity = list_entity[p_entity]
         '''
         #招标金额从后往前找
         if entity.entity_type=="money":
@@ -902,88 +930,206 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
 
 
         #如果实体属于角色集合,则往后找属性
-        if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
-            
-            p_entity += 1
-            #循环查找符合的属性
-            while(p_entity<len(list_entity)):
-                
-                entity_after = list_entity[p_entity]
-                if entity_after.sentence_index-entity.sentence_index>=sentence_len:
-                    p_entity -= 1
-                    break
-                #若是遇到公司实体,则跳出循环
-                if entity_after.entity_type in ['org','company']:
-                    p_entity -= 1
-                    break
-                if entity_after.values is not None:
-                    if entity_after.entity_type=="money":
-                        if entity_after.values[entity_after.label]>=on_value:
-                            '''
-                            #招标金额从后往前找
-                            if str(entity_after.label)=="0":
-                                packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
-                                if packagePointer is None:
-                                    packageName = "Project"
+        # if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
+        #
+        #     p_entity += 1
+        #     #循环查找符合的属性
+        #     while(p_entity<len(list_entity)):
+        #
+        #         entity_after = list_entity[p_entity]
+        #         if entity_after.sentence_index-entity.sentence_index>=sentence_len:
+        #             p_entity -= 1
+        #             break
+        #         #若是遇到公司实体,则跳出循环
+        #         if entity_after.entity_type in ['org','company']:
+        #             p_entity -= 1
+        #             break
+        #         if entity_after.values is not None:
+        #             if entity_after.entity_type=="money":
+        #                 if entity_after.values[entity_after.label]>=on_value:
+        #                     '''
+        #                     #招标金额从后往前找
+        #                     if str(entity_after.label)=="0":
+        #                         packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
+        #                         if packagePointer is None:
+        #                             packageName = "Project"
+        #                         else:
+        #                             packageName = packagePointer.entity_text
+        #                         addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
+        #                     '''
+        #                     if str(entity_after.label)=="1":
+        #                         #print(entity_after.entity_text,entity.entity_text)
+        #                         _list_entitys = [entity]+entity.linked_entitys
+        #                         if len(PackageSet)>0:
+        #                             packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
+        #                             if packagePointer is None:
+        #                                 packageName_entity = "Project"
+        #                             else:
+        #                                 packageName_entity = packagePointer.entity_text
+        #                         else:
+        #                             packageName_entity = "Project"
+        #                         if str(entity.label) in ["2","3","4"]:
+        #                             # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
+        #                             if entity_after.notes == '单价' or float(entity_after.entity_text)<5000: #2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
+        #                                 addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
+        #                                                  0.5)
+        #                                 entity.pointer_money = entity_after
+        #                                 # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+        #                             else:
+        #                                 addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
+        #                                                  entity_after.values[entity_after.label])
+        #                                 entity.pointer_money = entity_after
+        #                                 # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+        #                                 if entity_after.values[entity_after.label]>0.6:
+        #                                     break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额
+        #                             #add pointer_money
+        #                             # entity.pointer_money = entity_after
+        #                             # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+        #                             # if entity_after.notes!='单价':
+        #                             #     break  # 2021/7/16 新增,找到中标金额即停止,不再往后找金额
+        #                 '''
+        #             if entity_after.entity_type=="person":
+        #                 if entity_after.values[entity_after.label]>=on_value_person:
+        #                     if str(entity_after.label)=="1":
+        #                         for i in range(len(roleList)):
+        #                             if roleList[i].role_name=="tenderee":
+        #                                 roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
+        #                     elif str(entity_after.label)=="2":
+        #                         for i in range(len(roleList)):
+        #                             if roleList[i].role_name=="agency":
+        #                                 roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
+        #                     elif str(entity_after.label)=="3":
+        #                         _list_entitys = [entity]+entity.linked_entitys
+        #                         for _entity in _list_entitys:
+        #                             for i in range(len(roleList)):
+        #                                 if roleList[i].entity_text==_entity.entity_text:
+        #                                     if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
+        #                                         break
+        #                                     roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
+        #             '''
+        #
+        #         p_entity += 1
+        #
+        # p_entity += 1
+    # 记录每句的分词数量
+    tokens_num_dict = dict()
+    last_tokens_num = 0
+    for sentence in list_sentence:
+        _index = sentence.sentence_index
+        if _index == 0:
+            tokens_num_dict[_index] = 0
+        else:
+            tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
+        last_tokens_num = len(sentence.tokens)
+    attribute_type = ['money','serviceTime','ratio']# 'money'仅指“中投标金额”
+    for link_attribute in attribute_type:
+        temp_entity_list = []
+        if link_attribute=="money":
+            temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
+                                (ent.entity_type=='money' and ent.label==1 and ent.values[ent.label]>=0.5)]
+            # 删除重复的‘中投标金额’,一般为大小写两种样式
+            drop_tendererMoney = []
+            for ent_idx in range(len(temp_entity_list)-1):
+                entity = temp_entity_list[ent_idx]
+                if entity.entity_type=='money':
+                    next_entity = temp_entity_list[ent_idx+1]
+                    if next_entity.entity_type=='money':
+                        if getUnifyMoney(entity.entity_text)==getUnifyMoney(next_entity.entity_text):
+                            if (tokens_num_dict[next_entity.sentence_index] + next_entity.begin_index) - (
+                                               tokens_num_dict[entity.sentence_index] + entity.end_index) < 10:
+                                drop_tendererMoney.append(next_entity)
+            for _drop in drop_tendererMoney:
+                temp_entity_list.remove(_drop)
+        elif link_attribute=="serviceTime":
+            temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
+                                ent.entity_type=='serviceTime']
+        elif link_attribute=="ratio":
+            temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
+                                ent.entity_type=='ratio']
+        temp_entity_list = sorted(temp_entity_list,key=lambda x: (x.sentence_index, x.begin_index))
+        temp_match_list = []
+        for ent_idx in range(len(temp_entity_list)):
+            entity = temp_entity_list[ent_idx]
+            if entity.entity_type in ['org','company']:
+                match_nums = 0
+                tenderer_nums = 0 #经过其他中投标人的数量
+                byNotTenderer_match_nums = 0 #跟在中投标人后面的属性
+                for after_index in range(ent_idx + 1, min(len(temp_entity_list), ent_idx + 4)):
+                    after_entity = temp_entity_list[after_index]
+                    if after_entity.entity_type == link_attribute:
+                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
+                        sentence_distance = after_entity.sentence_index - entity.sentence_index
+                        if sentence_distance == 0:
+                            if distance < 100:
+                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                temp_match_list.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                                if not tenderer_nums:
+                                    byNotTenderer_match_nums += 1
                                 else:
-                                    packageName = packagePointer.entity_text
-                                addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
-                            '''
-                            if str(entity_after.label)=="1":
-                                #print(entity_after.entity_text,entity.entity_text)
-                                _list_entitys = [entity]+entity.linked_entitys
-                                if len(PackageSet)>0:
-                                    packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
-                                    if packagePointer is None:
-                                        packageName_entity = "Project"
-                                    else:
-                                        packageName_entity = packagePointer.entity_text
+                                    break
+                        else:
+                            if distance < 60:
+                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                temp_match_list.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                                if not tenderer_nums:
+                                    byNotTenderer_match_nums += 1
                                 else:
-                                    packageName_entity = "Project"
-                                if str(entity.label) in ["2","3","4"]:
-                                    # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
-                                    if entity_after.notes == '单价' or float(entity_after.entity_text)<5000: #2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
-                                        addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
-                                                         0.5)
-                                        entity.pointer_money = entity_after
-                                        # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
-                                    else:
-                                        addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
-                                                         entity_after.values[entity_after.label])
-                                        entity.pointer_money = entity_after
-                                        # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
-                                        if entity_after.values[entity_after.label]>0.6:
-                                            break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额
-                                    #add pointer_money
-                                    # entity.pointer_money = entity_after
-                                    # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
-                                    # if entity_after.notes!='单价':
-                                    #     break  # 2021/7/16 新增,找到中标金额即停止,不再往后找金额
-                        '''
-                    if entity_after.entity_type=="person":
-                        if entity_after.values[entity_after.label]>=on_value_person:
-                            if str(entity_after.label)=="1":
-                                for i in range(len(roleList)):
-                                    if roleList[i].role_name=="tenderee":
-                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
-                            elif str(entity_after.label)=="2":
-                                for i in range(len(roleList)):
-                                    if roleList[i].role_name=="agency":
-                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
-                            elif str(entity_after.label)=="3":
-                                _list_entitys = [entity]+entity.linked_entitys
-                                for _entity in _list_entitys:
-                                    for i in range(len(roleList)):
-                                        if roleList[i].entity_text==_entity.entity_text:
-                                            if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
-                                                break
-                                            roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
-                    '''
-                    
-                p_entity += 1  
-                
-        p_entity += 1
-    
+                                    break
+                    else:
+                        tenderer_nums += 1
+                #前向查找属性
+                if not match_nums or not byNotTenderer_match_nums:
+                    previous_entity = temp_entity_list[ent_idx - 1]
+                    if previous_entity.entity_type == link_attribute:
+                        if previous_entity.sentence_index == entity.sentence_index:
+                            distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
+                                    tokens_num_dict[
+                                        previous_entity.sentence_index] + previous_entity.end_index)
+                            if distance < 20:
+                                # 前向 没有 /10000
+                                value = (-1 / 2 * (distance ** 2))
+                                temp_match_list.append(Match(entity, previous_entity, value))
+        # km算法分配求解
+        dispatch_result = dispatch(temp_match_list)
+        # print(dispatch_result)
+        for match in dispatch_result:
+            _entity = match[0]
+            _attribute = match[1]
+            if link_attribute=='money':
+                _entity.pointer_money = _attribute
+                packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
+                                               "money-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
+                if packagePointer is None:
+                    packageName_entity = "Project"
+                else:
+                    packageName_entity = packagePointer.entity_text
+                if _attribute.notes == '单价' or float(_attribute.entity_text) < 5000:  # 2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
+                    addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,0.5)
+                else:
+                    addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,
+                                     _attribute.values[_attribute.label])
+            elif link_attribute=='serviceTime':
+                _entity.pointer_serviceTime = _attribute
+                packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
+                                               "serviceTime-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
+                if packagePointer is None:
+                    packageName_entity = "Project"
+                else:
+                    packageName_entity = packagePointer.entity_text
+                addServiceTimeByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute)
+            elif link_attribute=='ratio':
+                _entity.pointer_ratio = _attribute
+                packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
+                                               "ratio-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
+                if packagePointer is None:
+                    packageName_entity = "Project"
+                else:
+                    packageName_entity = packagePointer.entity_text
+                addRatioByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute)
+
     ''''''
     # 通过模型分类的招标/代理联系人
     list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
@@ -1073,24 +1219,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                     for one_phone in _phone:
                         PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
                         agency_phone.add(one_phone)
-    # km配对方法
-    def dispatch(match_list):
-        main_roles = list(set([match.main_role for match in match_list]))
-        attributes = list(set([match.attribute for match in match_list]))
-
-        label = np.zeros(shape=(len(main_roles), len(attributes)))
-        for match in match_list:
-            main_role = match.main_role
-            attribute = match.attribute
-            value = match.value
-            label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000
-        # print(label)
-        gragh = -label
-        # km算法
-        row, col = linear_sum_assignment(gragh)
-        max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value]
-        # return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch]
-        return [(main_roles[row], attributes[col]) for row, col in max_dispatch]
 
     # 正则提取电话号码实体
     # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
@@ -1193,15 +1321,15 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
         # 去重结果
         relation_list = list(set(relation_list))
     # print(relation_list)
-    tokens_num_dict = dict()
-    last_tokens_num = 0
-    for sentence in list_sentence:
-        _index = sentence.sentence_index
-        if _index == 0:
-            tokens_num_dict[_index] = 0
-        else:
-            tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
-        last_tokens_num = len(sentence.tokens)
+    # tokens_num_dict = dict()
+    # last_tokens_num = 0
+    # for sentence in list_sentence:
+    #     _index = sentence.sentence_index
+    #     if _index == 0:
+    #         tokens_num_dict[_index] = 0
+    #     else:
+    #         tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
+    #     last_tokens_num = len(sentence.tokens)
     right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
     linked_company = set()
     linked_person = set()
@@ -1570,7 +1698,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                                 match_nums += 1
                         # 实体无匹配时,尝试前向查找匹配
                         if not match_nums:
-                            if entity.label != 5 and entity.values[entity.label] > 0.5 and index != 0:
+                            if (entity.label != 5 or entity.entity_text in roleSet) and entity.values[entity.label] >= 0.5 and index != 0:
                                 previous_entity = split_entitys[index - 1]
                                 if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
                                     if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
@@ -2127,14 +2255,27 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                     PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
                     # print('通过其他中标人投标金额校正中标金额')
 
+    for item in list_pop:
+        PackDict.pop(item)
+
+    # 公告中只有"招标人"且无"联系人"链接时,直接取文中倒数第一个联系人
+    if len(PackDict)==1:
+        k = list(PackDict.keys())[0]
+        if len(PackDict[k]["roleList"])==1:
+            if PackDict[k]["roleList"][0].role_name == "tenderee":
+                if not PackDict[k]["roleList"][0].linklist:
+                    for _entity in temporary_list2[::-1]:
+                        if _entity.entity_type=='person' and _entity.label==3:
+                            if _entity.person_phone:
+                                _phone = [p.entity_text for p in _entity.person_phone]
+                                for _p in _phone:
+                                    PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p))
+                                break
 
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):
             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
 
-    for item in list_pop:
-        PackDict.pop(item)
-        
     return PackDict 
 
 def initPackageAttr(RoleList,PackageSet):
@@ -2241,9 +2382,9 @@ def getTimeAttributes(list_entity,list_sentence):
     time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
     list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
     dict_time = {
-        "time_release": [],
-        "time_bidopen": [],
-        "time_bidclose": [],
+        "time_release": [], # 1 发布时间
+        "time_bidopen": [], # 2 开标时间
+        "time_bidclose": [], # 3 截标时间
         'time_bidstart': [],  # 12 投标(开始)时间、响应文件接收(开始)时间
 
         'time_publicityStart': [],  # 4 公示开始时间(公示时间、公示期)
@@ -2424,18 +2565,11 @@ def getTimeAttributes(list_entity,list_sentence):
 def getOtherAttributes(list_entity):
     dict_other = {"moneysource":"",
                   "person_review":[],
-                  # "time_release":"",
-                  # "time_bidopen":"",
-                  # "time_bidclose":"",
                   "serviceTime":"",
                   "product":[],
                   "total_tendereeMoney":0,
                   "total_tendereeMoneyUnit":''}
-    # dict_time = {
-    #     "time_release": [],
-    #     "time_bidopen": [],
-    #     "time_bidclose": []
-    # }
+
     for entity in list_entity:
         if entity.entity_type == 'bidway':
             dict_other["bidway"] = turnBidWay(entity.entity_text)
@@ -2443,18 +2577,6 @@ def getOtherAttributes(list_entity):
             dict_other["moneysource"] = entity.entity_text
         elif entity.entity_type=='serviceTime':
             dict_other["serviceTime"] = entity.entity_text
-        # elif entity.entity_type == 'time' and entity.label==1:
-        #     if entity.values[entity.label]>0.6:
-        #         dict_time['time_release'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
-        #     # dict_other["time_release"] = timeFormat(entity.entity_text)
-        # elif entity.entity_type == 'time' and entity.label==2:
-        #     if entity.values[entity.label]>0.6:
-        #         dict_time['time_bidopen'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
-        #     # dict_other["time_bidopen"] = timeFormat(entity.entity_text)
-        # elif entity.entity_type == 'time' and entity.label == 3:
-        #     if entity.values[entity.label]>0.6:
-        #         dict_time['time_bidclose'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
-        #     # dict_other["time_bidclose"] = timeFormat(entity.entity_text)
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product':
@@ -2462,12 +2584,7 @@ def getOtherAttributes(list_entity):
         elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
                 dict_other["total_tendereeMoney"] = float(entity.entity_text)
                 dict_other["total_tendereeMoneyUnit"] = entity.money_unit
-    # 时间类别
-    # for time_type,value in dict_time.items():
-    #     list_time = dict_time[time_type]
-    #     if list_time:
-    #         list_time.sort(key=lambda x:x[1],reverse=True)
-    #         dict_other[time_type] = list_time[0][0]
+
     dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
 

+ 1 - 1
BiddingKG/dl/interface/predictor.py

@@ -794,7 +794,7 @@ class EPCPredict():
                 values.append(item)
             # phone_number = phone[i]
             # entity.set_Person(label,values,phone_number)
-            entity.set_Person(label,values,None)
+            entity.set_Person(label,values,[])
         # 为联系人匹配电话
         # self.person_search_phone(list_sentences, list_entitys)
 

+ 1 - 2
BiddingKG/dl/metrics/extractMetric.py

@@ -259,7 +259,6 @@ class ExtractMetric():
     def extractFromInterface(self,content):
         return json.loads(test("",content))
 
-
     def getDiff(self,_inter,_inter2):
         _dict = {}
         for k in ["code","product","person_review"]:
@@ -367,7 +366,7 @@ class ExtractMetric():
 
     def getMetrics(self,list_diff):
         dict_key_count = {}
-        print("all_count:",list_diff)
+        # print("all_count:",list_diff)
         for _diff in list_diff:
             for k,v in _diff.items():
                 if k not in dict_key_count:

+ 4 - 5
BiddingKG/dl/test/test4.py

@@ -46,7 +46,7 @@ def test(name,content):
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\test12354.txt","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
@@ -74,10 +74,9 @@ if __name__=="__main__":
     # 广州比地数据科技有限公司翻译服务工程招标
     # '''
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
-
     # print(predict("12", content,"打印机"))
-    a = time.time()
-    print(predict("12", content,"打印机"))
+    # print(predict("12", text,"打印机"))
     # test(12,content)
-    print("takes",time.time()-a)
+    test(12,text)
+    print("takes",time.time()-_time1)
     pass

+ 15 - 2
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -136,21 +136,34 @@ def predict(doc_id,text):
                 print(entity.entity_text, entity.begin_index, entity.end_index)
             elif entity.entity_type in ['org','company']:
                 _sentence = list_sentences[0][entity.sentence_index]
+                print(entity.entity_type)
                 if entity.pointer_person:
                     print("公司->联系人1:",end=' ')
                     print(entity.entity_text,[i.entity_text for i in entity.pointer_person],entity.label,entity.values)
                     # print(entity.entity_text,entity.label,entity.values)
-                    # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
+                    print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
                 else:
                     print("公司->联系人2:", end=' ')
                     print(entity.entity_text, entity.pointer_person,entity.label,entity.values)
-                    # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
+                    print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
                     pass
+                if entity.label in [2,3,4]:
+                    if entity.pointer_money:
+                        print("公司->中投标金额:", end=' ')
+                        print(entity.entity_text, entity.pointer_money.entity_text)
+                    if entity.pointer_serviceTime:
+                        print("公司->工期:", end=' ')
+                        print(entity.entity_text, entity.pointer_serviceTime.entity_text)
+                    if entity.pointer_ratio:
+                        print("公司->费率:", end=' ')
+                        print(entity.entity_text, entity.pointer_ratio.entity_text)
                 # print(entity.pointer_pack)
             # elif entity.entity_type =='serviceTime':
             #     print(entity.entity_text)
             #     if entity.pointer_pack:
             #         print('pointer_pack_name:',entity.pointer_pack.entity_text)
+            # elif entity.entity_type =='money':
+            #     print('money',entity.entity_text,entity.label)
             # elif entity.entity_type in ['package']:
             #     print('pack_entity:',entity.entity_text)
             # print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)