Selaa lähdekoodia

优化角色、金额、多中标人、表格角色等提取,优化表格提取结果与prem合并逻辑

lsm 1 vuosi sitten
vanhempi
commit
c2050e31ff

+ 1 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -168,7 +168,7 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
                     if have_bus:
                         lb, prob = get_role(dic)
                         bus_dic[_entity.entity_text] = (lb, prob)
-                        if lb == 0 and prob > 0.9 and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
+                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|银行|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
                             bus_tenderee.append(_entity)
                 if have_bus: # 20231115 改为只判断是否有工商数据,没有就考虑替换
                     long_entity.append(_entity)

+ 6 - 1
BiddingKG/dl/interface/Entitys.py

@@ -258,6 +258,7 @@ class PREM():
         self.money = money
         self.money_prob = money_prob
         self.linklist = linklist
+        self.multi_winner = set() # 2024/4/8 #添加多中标人
         
     def getString(self,roleList):
         '''
@@ -285,7 +286,7 @@ class Role():
     @summary: 定义一个角色拥有的所有属性
     '''
     
-    def __init__(self,role_name,entity_text,role_prob,money,money_prob,linklist):
+    def __init__(self,role_name,entity_text,role_prob,money,money_prob,linklist, multi_winner):
         
         self.role_name = role_name
         self.entity_text = entity_text
@@ -298,6 +299,7 @@ class Role():
         self.ratio = None #2022/01/06 新增 保存中投标金额相关费率 (ratio_value,ratio_type)
         self.serviceTime = "" #2021/01/06 新增 保存服务期限(工期)
         self.address = ""  #2022/08/08 新增 角色地址
+        self.multi_winner = multi_winner #2024/4/8 新增多中标人
 
     def getString(self):
         self.linklist = [item for item in set(self.linklist)]
@@ -342,6 +344,9 @@ class Role():
                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}
         if result['role_name'] == 'tenderee':
             result['role_prob'] = self.role_prob
+        if result['role_name'] == 'win_tenderer' and self.multi_winner != set():
+            self.multi_winner.add(result['role_text'])
+            result['multi_winner'] = ','.join(self.multi_winner)
         return result
 
 # 用于KM算法的组合配对

+ 19 - 18
BiddingKG/dl/interface/Preprocessing.py

@@ -794,7 +794,7 @@ def tableToText(soup):
     def getTableText(inner_table,head_list,key_direct=False):
         # packPattern = "(标包|[标包][号段名])"
         packPattern = "(标包|标的|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
-        rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见|评标情况|推荐顺序)"  # 2020/11/23 大网站规则,添加序号为排序
+        rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见|评标情况|推荐顺序|选取(情况|说明))"  # 2020/11/23 大网站规则,添加序号为排序
         entityPattern = "((候选|[中投]标|报价)(单位|公司|人|供应商))|供应商名称"
         moneyPattern = "([中投]标|报价)(金额|价)"
         height = len(inner_table)
@@ -901,6 +901,8 @@ def tableToText(soup):
                         money_text = ""
                         #在同一句话中重复的可以去掉
                         text_set = set()
+                        head = ""
+                        last_text = ""
                         for j in range(width):
                             cell = table_occurence[i][j]
                             if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
@@ -928,9 +930,14 @@ def tableToText(soup):
                                     else:
                                         text_line += head+cell["text"]+","
                                 text_set.add(str(head+cell["text"]))
+                                last_text = cell['text']
 
                         text += pack_text+rank_text+entity_text+money_text+text_line
-                        text = text[:-1]+"。" if len(text)>0 else text
+                        # text = text[:-1] + "。" if len(text) > 0 else text
+                        if len(text_set)==1 and head == '' and len(last_text)< 20 and (re.search('[::]$', last_text) or re.search('[一二三四五六七八九十\d]+[、.]\w{2,}', last_text)):
+                            text = text if re.search('\w$', text[:-1]) else text[:-1]
+                        else:
+                            text = text[:-1] + "。"
 
                 else:
                     for j in range(occu_width):
@@ -2537,17 +2544,13 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
     entity_type = "money"
     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
                           "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
-                          "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
+                          "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
-    # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
-
-    # pattern_money = re.compile("%s|%s|%s|%s" % (
-    # list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
-    # list_money_pattern["front_m"]))
+    # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元,合同金额:378.8万元 提取
 
-    pattern_money = re.compile("%s|%s|%s" % (
-        list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"]
-        )) # 2024/01/30  改为 list_money_pattern["front_m"] 单独搜索,避免与 key_word 冲突  详见合同元,合同金额:378.8万元,
+    pattern_money = re.compile("%s|%s|%s|%s" % (
+    list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
+    list_money_pattern["front_m"]))
 
     if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
         found_yeji += 1
@@ -2558,8 +2561,6 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
         if ser:
             sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
         all_match = re.finditer(pattern_money, sentence_text)
-        if all_match == None:
-            all_match = re.finditer(list_money_pattern["front_m"], sentence_text)
     # print('all_match:', all_match)
     for _match in all_match:
         # print('_match: ', _match.group())
@@ -2867,10 +2868,10 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 elif entity_type=="person" and len(entity_text)>10 and len(re.findall("[\u4e00-\u9fa5]",entity_text))<len(entity_text)/2:
                     continue
                 # 识别不完整的组织机构补充
-                if entity_type in ["org"]:
-                    end_words = re.search("^[\u4e00-\u9fa5]{,5}(?:办公室|部|中心|处|会)",sentence_text[end_index_temp:end_index_temp+10])
-                    if end_words:
-                        entity_text = entity_text + end_words.group()
+                # if entity_type in ["org"]:
+                #     end_words = re.search("^[\u4e00-\u9fa5]{,5}(?:办公室|部|中心|处|会)",sentence_text[end_index_temp:end_index_temp+10])  # 2024/4/7 注释掉 273356356 江门市新会区大鳌镇农村集体资产资源交易中心受新会
+                #     if end_words:
+                #         entity_text = entity_text + end_words.group()
 
                 for j in range(len(list_tokenbegin)):
                     if list_tokenbegin[j]==begin_index_temp:
@@ -2888,7 +2889,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
                 #去掉标点符号
                 if entity_type!='time':
-                    entity_text = re.sub("[,,。:!&@$\*\s]","",entity_text)
+                    entity_text = re.sub("[,,。:!&@$\*\s;;]","",entity_text) # 215553737
                 entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
                 # 组织机构实体名称补充
                 if entity_type in ["org", "company"]:

BIN
BiddingKG/dl/interface/agency_set.pkl


+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -349,7 +349,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-03-28'}
+    version_date = {'version_date': '2024-04-16'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
 
     '''最终检查修正招标、中标金额'''

+ 61 - 30
BiddingKG/dl/interface/getAttributes.py

@@ -311,7 +311,7 @@ def get_legal_comba(list_entity,dict_role_combination):
                 _dict = dict()
                 for _key in item.keys():
                     _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
-                _prob, max_role_prob = getSumExpectation(dict_pack_entity_prob, _dict)
+                _prob = getSumExpectation(dict_pack_entity_prob, _dict)
                 if _prob>MAX_PROB:
                     MAX_PROB = _prob
                     _MAX_PROB_COMBA = [item]
@@ -396,14 +396,11 @@ def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
     '''
     #修改为同一个实体只取对应包-角色的最大的概率值
     expect = 0
-    max_prob = 0
     dict_entity_prob = {}
     for _key_pack_entity in dict_pack_entity_prob:
         _key_pack = _key_pack_entity.split("$text$")[0]
         role_prob = dict_pack_entity_prob[_key_pack_entity][1]
         if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
-            if role_prob > max_prob:
-                max_prob = role_prob
             if _key_pack_entity in dict_entity_prob.keys():
                 if dict_entity_prob[_key_pack_entity]<role_prob:
                     dict_entity_prob[_key_pack_entity] = role_prob
@@ -438,7 +435,7 @@ def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
     for _key in dict_entity_prob.keys():
         symbol = 1 if dict_entity_prob[_key]>0 else -1 
         expect += symbol*math.pow(dict_entity_prob[_key],2)
-    return expect, max_prob
+    return expect
 
 
 def getRoleList(list_sentence,list_entity,on_value = 0.5):
@@ -458,6 +455,8 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
 
     #拿到所有可能的情况
     dict_role_combination = {}
+    tenderee_or_agency_set = set() # 记录所有预测为招标或代理的实体集合
+    win_tenderer_set = set() # 记录所有预测为中标的实体集合
   # print(PackageList)
     #拿到各个实体的packageName,packageCode
     for entity in list_entity:
@@ -520,9 +519,7 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
     _index = 0
     dict_pack_entity_prob = get_dict_entity_prob(list_entity)
     for item_combination in list_real_comba:
-        expect, max_role_prob = getSumExpectation(dict_pack_entity_prob, item_combination)
-        for k, v in item_combination.items():
-            item_combination[k] = [v, max_role_prob]
+        expect = getSumExpectation(dict_pack_entity_prob, item_combination)
         if expect>max_expect:
             max_index = _index
             max_expect = expect
@@ -534,14 +531,30 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
             packageName = _key.split("$$")[0]
             label = _key.split("$$")[1]
             role_name = dict_role_id.get(str(label))
-            # entity_text = list_real_comba[max_index][_key]
-            entity_text = list_real_comba[max_index][_key][0]
-            entity_prob = list_real_comba[max_index][_key][1]
+            entity_text = list_real_comba[max_index][_key]
+            entity_prob = dict_pack_entity_prob.get(_key+'$text$'+entity_text, ['',0])[1]
+            # entity_text = list_real_comba[max_index][_key][0]
+            # entity_prob = list_real_comba[max_index][_key][1]
             if packageName in dict_PackageCode.keys():
                 packagecode = dict_PackageCode.get(packageName)
             else:
                 packagecode = ""
             RoleList.append(PREM(packageName,packagecode,role_name,entity_text,entity_prob,0,0.0,[]))
+            if str(label) in ["0", "1"]:
+                tenderee_or_agency_set.add(entity_text)
+            elif str(label) in ["2"] and entity_prob > 0.8:
+                win_tenderer_set.add(entity_text)
+
+            if len(list_real_comba) > 1 and label == '2':
+                multi_winner = []
+                for comba in list_real_comba:
+                    tmp_ent = comba.get(_key, '')
+                    tmp_prob = dict_pack_entity_prob.get(_key+'$text$'+tmp_ent, ['',0])[1]
+                    if tmp_ent !='' and tmp_prob>0.8:
+                        multi_winner.append(comba[_key])
+                if len(set(multi_winner)) > 1:
+                    RoleList[-1].multi_winner = multi_winner
+            # print('RoleList: ', RoleList)
             RoleSet.add(entity_text)
 
     #根据最优树来修正list_entity中角色对包的连接
@@ -554,7 +567,7 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
                     _find_flag = True
             if not _find_flag:
                 _entity.pointer_pack = None
-    return RoleList,RoleSet,PackageList,PackageSet
+    return RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set
 
 def getPackageScopePattern():
     '''
@@ -690,6 +703,9 @@ def getPackagesFromArticle(list_sentence, list_entity):
                 elif re.search('同一(标段?|包)', content[max(0, iter.start()-2):iter.end()]):  # 不得参加同一标段
                     # print('过滤掉错误包:', iter.group())
                     continue
+                elif re.search('三包', content[max(0, iter.start()-2):iter.end()]) and re.search('第三包', content[max(0, iter.start()-2):iter.end()])==None:  # 规规章和“三包”规定
+                    # print('过滤掉错误包:', iter.group())
+                    continue
                 elif re.search('[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', iter.group(0)):
                     # print('过滤掉错误包号5:', iter.group(0))
                     continue
@@ -2870,7 +2886,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
     return PackDict
 
-def initPackageAttr(RoleList,PackageSet):
+def initPackageAttr(RoleList,PackageSet,win_tenderer_set,tenderee_or_agency_set):
     '''
     @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
     '''   
@@ -2883,7 +2899,7 @@ def initPackageAttr(RoleList,PackageSet):
             packDict[item.packageName]["code"] = item.packageCode
         # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
         # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,金额单位)
-        packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,item.role_prob,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,金额单位)
+        packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,item.role_prob,0,0.0,[],set(item.multi_winner)-win_tenderer_set-tenderee_or_agency_set)) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,多中标人)
     return packDict
                 
 def getPackageRoleMoney(list_sentence,list_entity,list_outline):
@@ -2897,12 +2913,14 @@ def getPackageRoleMoney(list_sentence,list_entity,list_outline):
     theRole = getRoleList(list_sentence,list_entity)
     if not theRole:
         return []
-    RoleList,RoleSet,PackageList,PackageSet = theRole
+    # RoleList,RoleSet,PackageList,PackageSet = theRole
+    RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set = theRole
     '''
     for item in PackageList:
         # print(item)
     '''
-    PackDict = initPackageAttr(RoleList, PackageSet)
+    # PackDict = initPackageAttr(RoleList, PackageSet)
+    PackDict = initPackageAttr(RoleList, PackageSet, win_tenderer_set,tenderee_or_agency_set)
 
     PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline)
     return PackDict
@@ -3864,7 +3882,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
     '''
     moneys = []
     moneys_attachment = []
-    if channel_dic['docchannel']['docchannel']=='中标信息' and 'win_tenderer' in str(prem):
+    if channel_dic['docchannel']['docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
         sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
         finalists = [] # 入围供应商
         i = 0
@@ -3872,7 +3890,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
             ent = list_entitys[0][i]
             b_idx_fr = ent.wordOffset_begin
             e_idx_fr = ent.wordOffset_end
-            multi_winner_l = [ent.entity_text]
+            multi_winner_l = []
             i += 1
             if ent.entity_type in ['money']:
                 money = float(ent.entity_text)
@@ -3880,7 +3898,8 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                     moneys_attachment.append(money)
                 else:
                     moneys.append(money)
-            if ent.entity_type in ['org', 'company'] and ent.label == 2 and ent.values[ent.label]>0.5:
+            if ent.entity_type in ['org', 'company'] and ent.label == 2 and ent.values[ent.label]>0.8:
+                multi_winner_l.append(ent.entity_text)
                 sentence_text = sentences[ent.sentence_index].sentence_text
                 pre_text = sentence_text[max(0, b_idx_fr-10):b_idx_fr]
                 if re.search('入围', pre_text) and re.search('未入围', pre_text)==None and ent.entity_text not in finalists:
@@ -3891,12 +3910,16 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                     e_idx_bh = ent_bh.wordOffset_end
                     if ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh-e_idx_fr==1:
                         sentence_text = sentences[ent_bh.sentence_index].sentence_text
-                        if sentence_text[e_idx_fr:b_idx_bh] in [';','、'] and (len(sentence_text)==e_idx_bh or sentence_text[e_idx_bh] in ['、', ',', '。']): # 修复多中标人刚好在文末index超出报错,例子 407126558
+                        if sentence_text[e_idx_fr:b_idx_bh] in [';','、','&',','] and (len(sentence_text)==e_idx_bh or sentence_text[e_idx_bh] in [';','','&', ',', '。']): # 修复多中标人刚好在文末index超出报错,例子 407126558
                             multi_winner_l.append(ent_bh.entity_text)
                             e_idx_fr = e_idx_bh
                             i = j + 1
                         else:
                             break
+                    elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh==e_idx_fr:
+                        multi_winner_l.append(ent_bh.entity_text)
+                        e_idx_fr = e_idx_bh
+                        i = j + 1
                     else:
                         break
             if len(multi_winner_l)>=2:
@@ -3905,12 +3928,9 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                         continue
                     for v in project.values():
                         for d in v['roleList']:
-                            if d.get('role_name', '') == 'win_tenderer':
-                                winner = d.get('role_text')
-                                if winner == multi_winner_l[0]:
-                                    d['multi_winner'] = ','.join(multi_winner_l)
-                                    break
-
+                            if d.get('role_name', '') == 'win_tenderer' and d.get('role_text', '') == multi_winner_l[0]:
+                                d['multi_winner'] = ','.join(set(multi_winner_l))
+                                break
         if len(finalists)>=2:
             for project in prem[0].values():
                 if not isinstance(project, dict):
@@ -3994,10 +4014,21 @@ def update_prem(old_prem, new_prem):
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem[k]['roleList'].append(d2)
-        if (len(new_prem)>1 or 'Project' not in new_prem) and 'Project' in old_prem and 'win_tenderer' in str(new_prem): # 表格提取到中标人的,去掉project包中标人
-            for d in old_prem['Project']['roleList']:
-                if d['role_name'] in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
-                    old_prem['Project']['roleList'].remove(d) # 提取到其他包,去掉 project 里面的中标角色
+        if len(old_prem)>1 and 'Project' in old_prem and 'win_tenderer' in str(new_prem): # 表格提取到中标人的,去掉project包中标人
+            pro_winner = set()
+            other_winner = set()
+            for k in old_prem:
+                for d in old_prem[k]['roleList']:
+                    if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']:
+                        if k == 'Project':
+                            pro_winner.add(d['role_text'])
+                        else:
+                            other_winner.add(d['role_text'])
+            if pro_winner & other_winner != set():
+                # print('过滤掉多包相同中标人在不同包')
+                for d in old_prem['Project']['roleList']:
+                    if d['role_name'] in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
+                        old_prem['Project']['roleList'].remove(d) # 提取到其他包,去掉 project 里面的中标角色
         if multi_tendereeMoney and 'Project' in old_prem and float(old_prem['Project']['tendereeMoney'])!=0: # 表格提取到多标段招标金额,去掉Project包招标金额
             old_prem['Project']['tendereeMoney'] = 0
 

BIN
BiddingKG/dl/interface/header_set.pkl


+ 6 - 1
BiddingKG/dl/interface/modelFactory.py

@@ -95,8 +95,13 @@ class Model_role_classify_word():
         text = re.sub('[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd', text)
         text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)
         text = re.sub('\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)
-        text = re.sub('序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', '', text)
+        text = re.sub('序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、|([^\w]|^)序:?\d+', '  d', text) # ,序:1,单位名称:
         text = re.sub('(中标|成交|中选|入围)(工程|项目)', '工程', text)  # 修复易错分为中标人
+        text = re.sub('约定', '  ', text) # 修复 233233636 错分为中标人 国有产权网上竞价有关约定 辽阳市公共资源交易中心 ,标
+        text = re.sub('中介机构', '投标机构', text) # 251058999 错分为中标人 序号:2,中介机构名称:
+        text = re.sub('(采购|招标)人名称、地址和联系方式:', '采购人:', text) # 275065998
+        if re.search('(最终)?排名:', text) and re.search('(最终)?排名:第?[123一二三]', text)==None:
+            text = re.sub('(最终)?排名:', '    ', text)
         # text = re.sub('(采购|招标|发布)机构', '发布人', text)
         return text.replace('(', '(').replace(')', ')').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
 

+ 137 - 73
BiddingKG/dl/interface/predictor.py

@@ -40,6 +40,12 @@ file = os.path.dirname(__file__) + '/agency_set.pkl'
 with open(file, 'rb') as f:
     agency_set = pickle.load(f)
 
+def is_agency(entity_text):
+    """Return True when entity_text matches the agency-style name regex below or is a member of agency_set, else False."""
+    # Raw string: the pattern is unchanged, but '\w' no longer triggers Python's invalid-escape-sequence warning.
+    agency_name_pattern = r'(招投?标|采购|代理|咨询|管理|物资|事务所?|顾问|监理|拍卖)[()\w]{,4}(有限)?(责任)?公司|(采购|招投?标|交易|代理|咨询)[()\w]{,4}(中心|服务所)|法院$'
+    return re.search(agency_name_pattern, entity_text) is not None or entity_text in agency_set
+
 from threading import RLock
 dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
               "prem":{"predictor":None,"Lock":RLock()},
@@ -794,13 +800,13 @@ class PREMPredict():
             elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
                 label = 5
             elif label == 0:
-                if re.search('拟邀请$|受邀谈判方', front):
+                if re.search('拟邀请$|受邀谈判方|直购企业:$', front):
                     label = 2
                     values[label] = 0.501
-                elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为:]+', front) and re.search('(招标|采购|咨询|代理|管理)\w*公司|(采购|交易)(中心|市场)', entity.entity_text):
+                elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为:]+', front) and is_agency(entity.entity_text):
                     label = 1
                     values[label] = 0.501
-                elif re.search('采用$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-
+                elif re.search('采用$|异议受理部门', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-
                     label = 5
                 elif re.search(',单位名称:$', front) and re.search('^,(中标|中选)价格', behind):
                     label = 2
@@ -824,14 +830,14 @@ class PREMPredict():
                 elif re.search('税费', front) and re.search('^承担', behind):
                     label = 5
                 elif re.search('第一候补|第一后备|备选', front):
-                    label = 2
+                    label = 3
                     values[label] = 0.6
                 elif re.search('放弃中标资格$|是否中标:否|^(中标|成交)(公示|公告)', behind):
                     values[2] = 0.5
                     label = 5
-                elif re.search('(承包权人|帐户名称):$', front):
+                elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$', front):  # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位:
                     label = 5
-                elif re.search('合同供方:?$', front):
+                elif re.search('合同供方:?$|合同签约单位', front):
                     label = 0
                     values[label] = 0.5
             elif re.search('是否中标:是,供应商', front) and label == 5:
@@ -849,11 +855,13 @@ class PREMPredict():
                     values[label] = 0.501
                 elif re.search('^:受', behind):  # 354009560 附件格式问题 ,中选中介服务机构通知书,编号:HZ2305120541,中汕项目管理有限公司:受惠东县人民政府大岭街道办事处委托
                     label = 5
-                elif re.search('发布机构', front) and re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站|馆)$', entity.entity_text):
+                elif re.search('发布机构', front) and not is_agency(entity.entity_text):
                     label = 0
                     values[label] = 0.501
                 elif re.search('开户银行:$', front): # 368214232 法定代表人:委托代理人:开户银行:鸡东建行
                     label = 5
+                elif re.search('委托$', front) and re.search('^(抽样|送检|看样)', behind):
+                    label = 5
             elif label in [3,4]:
                 if re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
                     label = 2
@@ -922,7 +930,7 @@ class PREMPredict():
                 label = 2
             elif label == 1: # 错误中标金额处理
                 if re.search('[::,。](总金额|总价|单价|合价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
-                    values[label] = 0.49
+                    values[label] = 0.5
                 elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', front): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
                     values[label] = 0.49
                 elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
@@ -1376,7 +1384,7 @@ class RoleRulePredictor():
                                            "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$)"  # 解决表头识别不到加逗号情况,需前面为,。空
         self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)" \
                                            "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
-                                           "|结果公示如下:摇出球号:\d+号,中介机构:$)"  # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标
+                                           "|结果公示如下:摇出球号:\d+号,中介机构:$|直购企业:$)"  # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标
 
         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商)))|" \
                                          "^((报价|价格)最低,|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
@@ -1423,7 +1431,7 @@ class RoleRulePredictor():
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
         self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|采购成本价")  # |建安费用 不作为招标金额
-        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
+        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
         self.pattern_money_other = re.compile("代理费|服务费")
         self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
@@ -1451,8 +1459,9 @@ class RoleRulePredictor():
                         if _role == "tendereeORagency":  # 2022/3/9 新增不确定招标代理判断逻辑
                             # print('p_entity_sentenceindex:', p_entity.sentence_index)
 
-                            if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', entity_text) \
-                                    or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', entity_text) == None:
+                            # if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', entity_text) \
+                            #         or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', entity_text) == None:
+                            if is_agency(entity_text):
                                 _role = 'tenderee'
                             else:
                                 _role = "agency"
@@ -1586,7 +1595,8 @@ class RoleRulePredictor():
                                             break
                                         if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
                                                 _name) >= 0:
-                                            if p_entity.entity_text in agency_set or re.search('(代理|管理|咨询|招投?标|采购)\w{,6}公司', p_entity.entity_text): # 在代理人集合的作为代理人
+                                            # if p_entity.entity_text in agency_set or re.search('(代理|管理|咨询|招投?标|采购)\w{,6}公司', p_entity.entity_text): # 在代理人集合的作为代理人
+                                            if is_agency(p_entity.entity_text): # 2024/3/29 统一方法判断是否为代理
                                                 find_flag = True
                                                 _label = 1
                                                 p_entity.label = _label
@@ -1819,8 +1829,11 @@ class RoleRuleFinalAdd():
         :param list_codenames:
         :return:
         '''
+
         # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
         main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
+        if len(list_sentences[0])>0 and list_sentences[0][-1].in_attachment:
+            main_sentences = list_sentences[0][-1:] + main_sentences[-2:]
         if len(main_sentences)==0:
             return 0
         # end_tokens = []
@@ -1834,7 +1847,19 @@ class RoleRuleFinalAdd():
             # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
             sear_ent = re.search('([,。;]|^)(?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
             if sear_ent:
+                b, e = sear_ent.span()
+                if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]):
+                    sear_ent = None
                 break
+        if sear_ent == None:
+            text_end = list_articles[0].content[-100:]
+            sear_ent = re.search(
+                '([,。;]|^)(?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?',
+                text_end)
+            if sear_ent:
+                b, e = sear_ent.span()
+                if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]):
+                    sear_ent = None
         sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
         sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
         if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
@@ -1850,14 +1875,14 @@ class RoleRuleFinalAdd():
         ents = []
         for ent in list_entitys[0]:
             if ent.entity_type in ['org', 'company']:
-                if ent.label == 0 and ent.values[ent.label]>0.5:
+                if ent.label == 0 and ent.values[ent.label]>0.55:
                     if '公共资源交易中心' in ent.entity_text:  # 公共资源交易中心不算招标或代理,只算平台
                         # ent.label = 5
                         ent.values[ent.label] = 0.6 if ent.values[ent.label]>0.6 else 0.5 # 改为降低概率,不改类别,防止 382573066 明显招标人表达不提取
                         continue
                     tenderee_list.append(ent.entity_text)
                     tenderee_notfound = False
-                elif ent.label == 1:
+                elif ent.label == 1 and ent.values[ent.label]>0.55:
                     agency_list.append(ent.entity_text)
                     agency_notfound = False
                 elif ent.label == 5:
@@ -1869,33 +1894,24 @@ class RoleRuleFinalAdd():
                 ent_re = _sear_ent.group('entity')
                 ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
 
-                if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站)$', ent_re)
-                                                  or re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) == None) \
-                        and ent_re not in agency_list and ent_re not in agency_set:
+                if tenderee_notfound or agency_notfound:
                     n = 0
                     for i in range(len(ents) - 1, -1, -1):
                         if not ents[i].in_attachment:
                             n += 1
                         if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
                             break
-                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and re.search('(大学|中学|小学|幼儿园|医院)$', ents[i].entity_text)) or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
-                            ents[i].label = 0
-                            ents[i].values[0] = 0.51 # 修改为比标题概率略高
-                            tenderee_notfound = False
-                            # log('正则最后补充实体: %s'%(ent_re))
-                            break
-                elif agency_notfound == True and ent_re not in tenderee_list and (
-                        re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) or ent_re in agency_set):
-                    n = 0
-                    for i in range(len(ents) - 1, -1, -1):
-                        if not ents[i].in_attachment:
-                            n += 1
-                        if n > 3 and _sear_ent==sear_ent:  # 文章末尾角色加日期这种只找后三个实体
+                        elif _sear_ent==sear_ent and ents[i].label != 5:  # 后面有角色的实体的停止继续往前
                             break
-                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
-                            ents[i].label = 1
-                            ents[i].values[1] = 0.51 # 修改为比标题概率略高
-                            agency_notfound = False
+                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and re.search('(大学|中学|小学|幼儿园|医院)$', ents[i].entity_text)) or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
+                            if agency_notfound and is_agency(ents[i].entity_text) and ents[i].entity_text not in tenderee_list:
+                                ents[i].label = 1
+                                ents[i].values[1] = 0.51 # 修改为比标题概率略高
+                                agency_notfound = False
+                            elif tenderee_notfound and not is_agency(ents[i].entity_text) and ents[i].entity_text not in agency_list:
+                                ents[i].label = 0
+                                ents[i].values[0] = 0.51 # 修改为比标题概率略高
+                                tenderee_notfound = False
                             # log('正则最后补充实体: %s'%(ent_re))
                             break
                     if not tenderee_notfound:
@@ -2191,7 +2207,7 @@ class RoleGrade():
         self.tenderee_left_5 = "(?P<tenderee_left_5>(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)"
         self.agency_left_9 = "(?P<agency_left_9>代理)"
         self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]名|排[名序]:1|名次:1)"
-        self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方))"
+        self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方|最[终后]选[择取]))"  # 229435497 最后选择西平,县中原彩印有限公司,作为此项目中标供应商,
         self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
         self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))"
         self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
@@ -2212,6 +2228,10 @@ class RoleGrade():
         org_winner = []
         company_winner = []
         org_tenderee = []
+        agency_l = []
+        agency_like_tenderee = [] # 类似招标人的代理人实体列表
+        low_prob_agency = []
+        low_prob_tenderee = []
         for entity in list_entitys[0]:
             if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> min_prob:
                 text = sentences[entity.sentence_index].sentence_text
@@ -2220,6 +2240,16 @@ class RoleGrade():
                 b = entity.wordOffset_begin
                 e = entity.wordOffset_end
                 not_found = 1
+                if re.search('(乙方:甲方:|甲方:乙方:)$', text[max(0, b-span):b]):
+                    entity.label = 0 if entity.entity_type == 'org' else 2
+                    entity.values[entity.label] = 0.55
+                    continue
+                elif re.search('(采购|招标)人(?或(采购|招标)?代理机构)?:$', text[max(0, b-span):b]):
+                    entity.label = 1 if is_agency(entity.entity_text) else 0
+                    entity.values[entity.label] = 0.8
+                    continue
+                elif re.search('(采购|招标|询比?价|遴选|寻源|比选)机构[是为:]+', text[max(0, b-span):b]) and not is_agency(entity.entity_text):
+                    agency_like_tenderee.append(entity)
                 for pattern in self.pattern_list:
                     if 'left' in pattern:
                         context = text[max(0, b-span):b]
@@ -2262,10 +2292,31 @@ class RoleGrade():
                         company_winner.append(entity)  # 保存中标人实体
                 if entity.label == 0 and entity.values[entity.label]> min_prob:
                     org_tenderee.append(entity.entity_text)  # 保存所有招标人名称
-            if entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label]<0.6:  # 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
-                # log('修改概率小于0.6的且在大数据代理集合里面的招标人为代理人%s:'%entity.entity_text)
+                elif entity.label == 1 and entity.values[entity.label]> min_prob:
+                    agency_l.append(entity.entity_text)
+            # if entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label]<0.6:  # 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
+            #     # log('修改概率小于0.6的且在大数据代理集合里面的招标人为代理人%s:'%entity.entity_text)
+            #     entity.label = 1
+            #     entity.values[entity.label] = 0.5
+
+            elif entity.entity_type in ['org', 'company'] and entity.label in [1, 0] and 0.5<=entity.values[entity.label]<0.6:
+                if entity.label == 1:
+                    low_prob_agency.append(entity)
+                else:
+                    low_prob_tenderee.append(entity)
+
+        if org_tenderee == [] and agency_like_tenderee:
+            for entity in agency_like_tenderee:
+                entity.label = 0
+                entity.values[entity.label] = 0.6
+        for entity in low_prob_agency:
+            if entity.entity_text in org_tenderee:
+                entity.label = 0
+                entity.values[entity.label] = 0.6
+        for entity in low_prob_tenderee:
+            if entity.entity_text in agency_l:
                 entity.label = 1
-                entity.values[entity.label] = 0.5
+                entity.values[entity.label] = 0.6
 
         if org_winner != []:
             flag = 0
@@ -5874,8 +5925,10 @@ class TableTag2List():
                         if text_process != None:
                             # text = [re.sub('\xa0', '', text_process(cell, final=False)), 0]
                             # td_text = re.sub('\xa0', '', text_process(cell, final=False))
-                            td_text = re.sub('\s|\xa0', '', str(cell.get_text())) # 修复 370835008 td 内公司被p标签拆分为两半情况
-                            if len(td_text)>30:
+                            td_text = re.sub('\s|\xa0', '', str(cell.get_text()))  # 修复 370835008 td 内公司被p标签拆分为两半情况
+                            if 'title' in cell.attrs and cell.get_text().strip().endswith('...') and cell.get_text().strip()[:-3] in cell.attrs['title']:
+                                td_text = cell.attrs['title']  # 修复 类似 215597851 省略号隐藏内容
+                            elif len(td_text)>30:
                                 td_text = re.sub('\xa0', '', text_process(cell, final=False))
                             if td_text == "":
                                 td_text = ' '
@@ -5953,7 +6006,7 @@ class TablePremExtractor(object):
 
 
     def find_header(self, td_list):
-        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟', '', it) for it in td_list]  # 去除表头无关信息,方便匹配判断是否为表头
+        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list]  # 去除表头无关信息,方便匹配判断是否为表头
         header_dic = dict()
         flag = False
         contain_header = False
@@ -6059,14 +6112,14 @@ class TablePremExtractor(object):
             return {}
         for i in df.index:
             same_package = False  # 连续重复包号,一般是 rowspan 造成;一包 多个采购
-            project_code = df.loc[i, headers['project_code'][0]] if "project_code" in headers else ""
-            package_code_raw = df.loc[i, headers['package_code'][0]] if "package_code" in headers else ""
-            project_name = df.loc[i, headers['project_name'][0]] if "project_name" in headers else ""
-            tenderee = df.loc[i, headers['tenderee'][0]] if "tenderee" in headers else ""
-            tenderer = df.loc[i, headers['tenderer'][0]] if "tenderer" in headers else ""
-            budget_ = df.loc[i, headers['budget'][0]] if "budget" in headers else ""
-            bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
-            win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
+            project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
+            package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else ""
+            project_name = df.loc[i, headers['project_name'][0]].strip() if "project_name" in headers else ""
+            tenderee = df.loc[i, headers['tenderee'][0]].strip() if "tenderee" in headers else ""
+            tenderer = df.loc[i, headers['tenderer'][0]].strip() if "tenderer" in headers else ""
+            budget_ = df.loc[i, headers['budget'][0]].strip() if "budget" in headers else ""
+            bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else ""
+            win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
 
             if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
                 # print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
@@ -6090,7 +6143,7 @@ class TablePremExtractor(object):
 
             if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取  防止类似 328485591 作为多包
                 break
-            if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and re.search('否|未(中标|成交|中选)', win_sort):
+            if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and (re.search('否|未(中标|成交|中选)', win_sort) or win_sort==''): # 2024/04/2 修复 252208201 为空的不中标
                 continue
             if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
                 continue
@@ -6303,7 +6356,7 @@ class CandidateExtractor(object):
             self.headerset = pickle.load(f)
 
     def find_header(self, td_list):
-        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
+        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
         header_dic = dict()
         flag = False
         contain_header = False
@@ -6384,23 +6437,28 @@ class CandidateExtractor(object):
         findtop3 = False
         findmoney = False
         line_num = 0
+        line_package = None
         for i in df.index:
-            package_code_raw = df.loc[i, headers['package_code'][0]] if "package_code" in headers else ""
-            candidate_ = df.loc[i, headers['candidate'][0]] if "candidate" in headers else ""
-            win_or_not = df.loc[i, headers['win_or_not'][0]] if "win_or_not" in headers else ""
+            package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else ""
+            candidate_ = df.loc[i, headers['candidate'][0]].strip() if "candidate" in headers else ""
+            win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
             # budget_ = df.loc[i, headers['budget'][0]] if "budget" in headers else ""
-            bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
-            win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
-            win_tenderer = df.loc[i, headers['win_tenderer'][0]] if "win_tenderer" in headers else ""
-            second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
-            third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
+            bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else ""
+            win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
+            win_tenderer = df.loc[i, headers['win_tenderer'][0]].strip() if "win_tenderer" in headers else ""
+            second_tenderer = df.loc[i, headers['second_tenderer'][0]].strip() if "second_tenderer" in headers else ""
+            third_tenderer = df.loc[i, headers['third_tenderer'][0]].strip() if "third_tenderer" in headers else ""
 
             if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配 # 排除 ,win_sort 避免367940050漏提取
                 # print('包含表头, 停止匹配')
                 break
             if len(set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2:  # 全部为空或内容一样 停止匹配
                 # print('全部为空或内容一样 停止匹配')
-                break
+                if len(set(df.loc[i,:]))==1 and re.search('^第?([一二三四五六七八九十]{1,3}|[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段])([一二三四五六七八九十]{1,3}|[a-zA-Z0-9-]{,9})?$', win_sort):
+                    line_package = win_sort
+                    continue
+                else:
+                    break
 
             if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # 修复某些表头不说 排名,直接用候选人代替
                 col_indx = headers['candidate'][0] -1
@@ -6411,6 +6469,8 @@ class CandidateExtractor(object):
                     win_sort = pre_col
 
             package_code = package_code_raw
+            if package_code == '' and line_package:
+                package_code = line_package
 
             # candidate = candidate_ if self.is_role(candidate_) else ""
             # tenderer = tenderer if self.is_role(tenderer) else ""
@@ -6637,6 +6697,10 @@ class WebsourceTenderee():
         '''
         p = '(医院|学院|学校|中学|小学|大学|幼儿园|保健院|党校|银行|研究院|血站|红十字会|防治院|研究所)'
         web_ree = self.webno2ree.get(web_source_no, '')
+        if web_source_no.startswith('18591-') and web_ree == "":
+            web_ree = '中国人民解放军总医院'
+        elif web_source_no.startswith('Y00484-') and web_ree == "":
+            web_ree = '航空总医院'
         if web_ree != '':
             if 'Project' in prem[0]['prem']:
                 find_tenderee = False
@@ -7035,18 +7099,18 @@ if __name__=="__main__":
     # rs = product_attr.predict(docid='', html=html, page_time="")
     # print(rs)
 
-    docid = ""
-    title = ''
-    with open('d:/html/2.html', 'r', encoding='utf-8') as f:
-        html = f.read()
-    tb_extract = TablePremExtractor()
-    rs = tb_extract.predict(html, [
-        "广东省广裕集团嘉顺实业有限责任公司",
-        "广州顺为招标采购有限公司",
-        "中华人民共和国"
-    ], web_source_name = '河钢供应链管理平台')
-    print('标段数:',len(rs))
-    print(rs)
+    # docid = ""
+    # title = ''
+    # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
+    #     html = f.read()
+    # tb_extract = TablePremExtractor()
+    # rs = tb_extract.predict(html, [
+    #     "广东省广裕集团嘉顺实业有限责任公司",
+    #     "广州顺为招标采购有限公司",
+    #     "中华人民共和国"
+    # ], web_source_name = '河钢供应链管理平台')
+    # print('标段数:',len(rs))
+    # print(rs)
 
     # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
     # # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]