1 vuosi sitten · c2050e31ff
--- a/BiddingKG/dl/entityLink/entityLink.py
+++ b/BiddingKG/dl/entityLink/entityLink.py
@@ -168,7 +168,7 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
 
				                     if have_bus:
			
 
				                         lb, prob = get_role(dic)
			
 
				                         bus_dic[_entity.entity_text] = (lb, prob)
			
 
				-                        if lb == 0 and prob > 0.9 and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
			
 
				+                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|银行|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
			
 
				                             bus_tenderee.append(_entity)
			
 
				                 if have_bus: # 20231115 改为只判断是否有工商数据，没有就考虑替换
			
 
				                     long_entity.append(_entity)
			
--- a/BiddingKG/dl/interface/Entitys.py
+++ b/BiddingKG/dl/interface/Entitys.py
@@ -258,6 +258,7 @@ class PREM():
 
				         self.money = money
			
 
				         self.money_prob = money_prob
			
 
				         self.linklist = linklist
			
 
				+        self.multi_winner = set() # 2024/4/8 #添加多中标人
			
 
				         
			
 
				     def getString(self,roleList):
			
 
				         '''
			
@@ -285,7 +286,7 @@ class Role():
 
				     @summary: 定义一个角色拥有的所有属性
			
 
				     '''
			
 
				     
			
 
				-    def __init__(self,role_name,entity_text,role_prob,money,money_prob,linklist):
			
 
				+    def __init__(self,role_name,entity_text,role_prob,money,money_prob,linklist, multi_winner):
			
 
				         
			
 
				         self.role_name = role_name
			
 
				         self.entity_text = entity_text
			
@@ -298,6 +299,7 @@ class Role():
 
				         self.ratio = None #2022/01/06 新增 保存中投标金额相关费率 (ratio_value,ratio_type)
			
 
				         self.serviceTime = "" #2021/01/06 新增 保存服务期限(工期)
			
 
				         self.address = ""  #2022/08/08 新增 角色地址
			
 
				+        self.multi_winner = multi_winner #2024/4/8 新增多中标人
			
 
				 
			
 
				     def getString(self):
			
 
				         self.linklist = [item for item in set(self.linklist)]
			
@@ -342,6 +344,9 @@ class Role():
 
				                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}
			
 
				         if result['role_name'] == 'tenderee':
			
 
				             result['role_prob'] = self.role_prob
			
 
				+        if result['role_name'] == 'win_tenderer' and self.multi_winner != set():
			
 
				+            self.multi_winner.add(result['role_text'])
			
 
				+            result['multi_winner'] = ','.join(self.multi_winner)
			
 
				         return result
			
 
				 
			
 
				 # 用于KM算法的组合配对
			
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -794,7 +794,7 @@ def tableToText(soup):
 
				     def getTableText(inner_table,head_list,key_direct=False):
			
 
				         # packPattern = "(标包|[标包][号段名])"
			
 
				         packPattern = "(标包|标的|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则，补充采购类包名
			
 
				-        rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见|评标情况|推荐顺序)"  # 2020/11/23 大网站规则，添加序号为排序
			
 
				+        rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见|评标情况|推荐顺序|选取(情况|说明))"  # 2020/11/23 大网站规则，添加序号为排序
			
 
				         entityPattern = "((候选|[中投]标|报价)(单位|公司|人|供应商))|供应商名称"
			
 
				         moneyPattern = "([中投]标|报价)(金额|价)"
			
 
				         height = len(inner_table)
			
@@ -901,6 +901,8 @@ def tableToText(soup):
 
				                         money_text = ""
			
 
				                         #在同一句话中重复的可以去掉
			
 
				                         text_set = set()
			
 
				+                        head = ""
			
 
				+                        last_text = ""
			
 
				                         for j in range(width):
			
 
				                             cell = table_occurence[i][j]
			
 
				                             if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
			
@@ -928,9 +930,14 @@ def tableToText(soup):
 
				                                     else:
			
 
				                                         text_line += head+cell["text"]+"，"
			
 
				                                 text_set.add(str(head+cell["text"]))
			
 
				+                                last_text = cell['text']
			
 
				 
			
 
				                         text += pack_text+rank_text+entity_text+money_text+text_line
			
 
				-                        text = text[:-1]+"。" if len(text)>0 else text
			
 
				+                        # text = text[:-1] + "。" if len(text) > 0 else text
			
 
				+                        if len(text_set)==1 and head == '' and len(last_text)< 20 and (re.search('[:：]$', last_text) or re.search('[一二三四五六七八九十\d]+[、.]\w{2,}', last_text)):
			
 
				+                            text = text if re.search('\w$', text[:-1]) else text[:-1]
			
 
				+                        else:
			
 
				+                            text = text[:-1] + "。"
			
 
				 
			
 
				                 else:
			
 
				                     for j in range(occu_width):
			
@@ -2537,17 +2544,13 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
 
				     entity_type = "money"
			
 
				     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
			
 
				                           "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价)(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
			
 
				-                          "front_m": "((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)]?)\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
			
 
				+                          "front_m": "((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)]?)\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
			
 
				                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
			
 
				-    # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。
			
 
				-
			
 
				-    # pattern_money = re.compile("%s|%s|%s|%s" % (
			
 
				-    # list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
			
 
				-    # list_money_pattern["front_m"]))
			
 
				+    # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元，合同金额：378.8万元 提取
			
 
				 
			
 
				-    pattern_money = re.compile("%s|%s|%s" % (
			
 
				-        list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"]
			
 
				-        )) # 2024/01/30  改为 list_money_pattern["front_m"] 单独搜索，避免与 key_word 冲突  详见合同元，合同金额：378.8万元，
			
 
				+    pattern_money = re.compile("%s|%s|%s|%s" % (
			
 
				+    list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
			
 
				+    list_money_pattern["front_m"]))
			
 
				 
			
 
				     if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
			
 
				         found_yeji += 1
			
@@ -2558,8 +2561,6 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
 
				         if ser:
			
 
				             sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
			
 
				         all_match = re.finditer(pattern_money, sentence_text)
			
 
				-        if all_match == None:
			
 
				-            all_match = re.finditer(list_money_pattern["front_m"], sentence_text)
			
 
				     # print('all_match:', all_match)
			
 
				     for _match in all_match:
			
 
				         # print('_match: ', _match.group())
			
@@ -2867,10 +2868,10 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                 elif entity_type=="person" and len(entity_text)>10 and len(re.findall("[\u4e00-\u9fa5]",entity_text))<len(entity_text)/2:
			
 
				                     continue
			
 
				                 # 识别不完整的组织机构补充
			
 
				-                if entity_type in ["org"]:
			
 
				-                    end_words = re.search("^[\u4e00-\u9fa5]{,5}(?:办公室|部|中心|处|会)",sentence_text[end_index_temp:end_index_temp+10])
			
 
				-                    if end_words:
			
 
				-                        entity_text = entity_text + end_words.group()
			
 
				+                # if entity_type in ["org"]:
			
 
				+                #     end_words = re.search("^[\u4e00-\u9fa5]{,5}(?:办公室|部|中心|处|会)",sentence_text[end_index_temp:end_index_temp+10])  # 2024/4/7 注释掉 273356356 江门市新会区大鳌镇农村集体资产资源交易中心受新会
			
 
				+                #     if end_words:
			
 
				+                #         entity_text = entity_text + end_words.group()
			
 
				 
			
 
				                 for j in range(len(list_tokenbegin)):
			
 
				                     if list_tokenbegin[j]==begin_index_temp:
			
@@ -2888,7 +2889,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				 
			
 
				                 #去掉标点符号
			
 
				                 if entity_type!='time':
			
 
				-                    entity_text = re.sub("[,，。：!&@$\*\s]","",entity_text)
			
 
				+                    entity_text = re.sub("[,，。：!&@$\*\s;；]","",entity_text) # 215553737
			
 
				                 entity_text = entity_text.replace("(","（").replace(")","）") if isinstance(entity_text,str) else entity_text
			
 
				                 # 组织机构实体名称补充
			
 
				                 if entity_type in ["org", "company"]:
			
--- a/BiddingKG/dl/interface/agency_set.pkl
+++ b/BiddingKG/dl/interface/agency_set.pkl
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -349,7 +349,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2024-03-28'}
			
 
				+    version_date = {'version_date': '2024-04-16'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
			
 
				 
			
 
				     '''最终检查修正招标、中标金额'''
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -311,7 +311,7 @@ def get_legal_comba(list_entity,dict_role_combination):
 
				                 _dict = dict()
			
 
				                 for _key in item.keys():
			
 
				                     _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
			
 
				-                _prob, max_role_prob = getSumExpectation(dict_pack_entity_prob, _dict)
			
 
				+                _prob = getSumExpectation(dict_pack_entity_prob, _dict)
			
 
				                 if _prob>MAX_PROB:
			
 
				                     MAX_PROB = _prob
			
 
				                     _MAX_PROB_COMBA = [item]
			
@@ -396,14 +396,11 @@ def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
 
				     '''
			
 
				     #修改为同一个实体只取对应包-角色的最大的概率值
			
 
				     expect = 0
			
 
				-    max_prob = 0
			
 
				     dict_entity_prob = {}
			
 
				     for _key_pack_entity in dict_pack_entity_prob:
			
 
				         _key_pack = _key_pack_entity.split("$text$")[0]
			
 
				         role_prob = dict_pack_entity_prob[_key_pack_entity][1]
			
 
				         if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
			
 
				-            if role_prob > max_prob:
			
 
				-                max_prob = role_prob
			
 
				             if _key_pack_entity in dict_entity_prob.keys():
			
 
				                 if dict_entity_prob[_key_pack_entity]<role_prob:
			
 
				                     dict_entity_prob[_key_pack_entity] = role_prob
			
@@ -438,7 +435,7 @@ def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
 
				     for _key in dict_entity_prob.keys():
			
 
				         symbol = 1 if dict_entity_prob[_key]>0 else -1 
			
 
				         expect += symbol*math.pow(dict_entity_prob[_key],2)
			
 
				-    return expect, max_prob
			
 
				+    return expect
			
 
				 
			
 
				 
			
 
				 def getRoleList(list_sentence,list_entity,on_value = 0.5):
			
@@ -458,6 +455,8 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
 
				 
			
 
				     #拿到所有可能的情况
			
 
				     dict_role_combination = {}
			
 
				+    tenderee_or_agency_set = set() # 记录所有预测为招标或代理的实体集合
			
 
				+    win_tenderer_set = set() # 记录所有预测为中标的实体集合
			
 
				   # print(PackageList)
			
 
				     #拿到各个实体的packageName,packageCode
			
 
				     for entity in list_entity:
			
@@ -520,9 +519,7 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
 
				     _index = 0
			
 
				     dict_pack_entity_prob = get_dict_entity_prob(list_entity)
			
 
				     for item_combination in list_real_comba:
			
 
				-        expect, max_role_prob = getSumExpectation(dict_pack_entity_prob, item_combination)
			
 
				-        for k, v in item_combination.items():
			
 
				-            item_combination[k] = [v, max_role_prob]
			
 
				+        expect = getSumExpectation(dict_pack_entity_prob, item_combination)
			
 
				         if expect>max_expect:
			
 
				             max_index = _index
			
 
				             max_expect = expect
			
@@ -534,14 +531,30 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
 
				             packageName = _key.split("$$")[0]
			
 
				             label = _key.split("$$")[1]
			
 
				             role_name = dict_role_id.get(str(label))
			
 
				-            # entity_text = list_real_comba[max_index][_key]
			
 
				-            entity_text = list_real_comba[max_index][_key][0]
			
 
				-            entity_prob = list_real_comba[max_index][_key][1]
			
 
				+            entity_text = list_real_comba[max_index][_key]
			
 
				+            entity_prob = dict_pack_entity_prob.get(_key+'$text$'+entity_text, ['',0])[1]
			
 
				+            # entity_text = list_real_comba[max_index][_key][0]
			
 
				+            # entity_prob = list_real_comba[max_index][_key][1]
			
 
				             if packageName in dict_PackageCode.keys():
			
 
				                 packagecode = dict_PackageCode.get(packageName)
			
 
				             else:
			
 
				                 packagecode = ""
			
 
				             RoleList.append(PREM(packageName,packagecode,role_name,entity_text,entity_prob,0,0.0,[]))
			
 
				+            if str(label) in ["0", "1"]:
			
 
				+                tenderee_or_agency_set.add(entity_text)
			
 
				+            elif str(label) in ["2"] and entity_prob > 0.8:
			
 
				+                win_tenderer_set.add(entity_text)
			
 
				+
			
 
				+            if len(list_real_comba) > 1 and label == '2':
			
 
				+                multi_winner = []
			
 
				+                for comba in list_real_comba:
			
 
				+                    tmp_ent = comba.get(_key, '')
			
 
				+                    tmp_prob = dict_pack_entity_prob.get(_key+'$text$'+tmp_ent, ['',0])[1]
			
 
				+                    if tmp_ent !='' and tmp_prob>0.8:
			
 
				+                        multi_winner.append(comba[_key])
			
 
				+                if len(set(multi_winner)) > 1:
			
 
				+                    RoleList[-1].multi_winner = multi_winner
			
 
				+            # print('RoleList: ', RoleList)
			
 
				             RoleSet.add(entity_text)
			
 
				 
			
 
				     #根据最优树来修正list_entity中角色对包的连接
			
@@ -554,7 +567,7 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
 
				                     _find_flag = True
			
 
				             if not _find_flag:
			
 
				                 _entity.pointer_pack = None
			
 
				-    return RoleList,RoleSet,PackageList,PackageSet
			
 
				+    return RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set
			
 
				 
			
 
				 def getPackageScopePattern():
			
 
				     '''
			
@@ -690,6 +703,9 @@ def getPackagesFromArticle(list_sentence, list_entity):
 
				                 elif re.search('同一(标段?|包)', content[max(0, iter.start()-2):iter.end()]):  # 不得参加同一标段
			
 
				                     # print('过滤掉错误包：', iter.group())
			
 
				                     continue
			
 
				+                elif re.search('三包', content[max(0, iter.start()-2):iter.end()]) and re.search('第三包', content[max(0, iter.start()-2):iter.end()])==None:  # 规规章和“三包”规定
			
 
				+                    # print('过滤掉错误包：', iter.group())
			
 
				+                    continue
			
 
				                 elif re.search('[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', iter.group(0)):
			
 
				                     # print('过滤掉错误包号5：', iter.group(0))
			
 
				                     continue
			
@@ -2870,7 +2886,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
			
 
				     return PackDict
			
 
				 
			
 
				-def initPackageAttr(RoleList,PackageSet):
			
 
				+def initPackageAttr(RoleList,PackageSet,win_tenderer_set,tenderee_or_agency_set):
			
 
				     '''
			
 
				     @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
			
 
				     '''   
			
@@ -2883,7 +2899,7 @@ def initPackageAttr(RoleList,PackageSet):
 
				             packDict[item.packageName]["code"] = item.packageCode
			
 
				         # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
			
 
				         # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) #Role(角色名称，实体名称，角色阈值，金额，金额阈值，连接列表，金额单位)
			
 
				-        packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,item.role_prob,0,0.0,[])) #Role(角色名称，实体名称，角色阈值，金额，金额阈值，连接列表，金额单位)
			
 
				+        packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,item.role_prob,0,0.0,[],set(item.multi_winner)-win_tenderer_set-tenderee_or_agency_set)) #Role(角色名称，实体名称，角色阈值，金额，金额阈值，连接列表，多中标人)
			
 
				     return packDict
			
 
				                 
			
 
				 def getPackageRoleMoney(list_sentence,list_entity,list_outline):
			
@@ -2897,12 +2913,14 @@ def getPackageRoleMoney(list_sentence,list_entity,list_outline):
 
				     theRole = getRoleList(list_sentence,list_entity)
			
 
				     if not theRole:
			
 
				         return []
			
 
				-    RoleList,RoleSet,PackageList,PackageSet = theRole
			
 
				+    # RoleList,RoleSet,PackageList,PackageSet = theRole
			
 
				+    RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set = theRole
			
 
				     '''
			
 
				     for item in PackageList:
			
 
				         # print(item)
			
 
				     '''
			
 
				-    PackDict = initPackageAttr(RoleList, PackageSet)
			
 
				+    # PackDict = initPackageAttr(RoleList, PackageSet)
			
 
				+    PackDict = initPackageAttr(RoleList, PackageSet, win_tenderer_set,tenderee_or_agency_set)
			
 
				 
			
 
				     PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline)
			
 
				     return PackDict
			
@@ -3864,7 +3882,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
 
				     '''
			
 
				     moneys = []
			
 
				     moneys_attachment = []
			
 
				-    if channel_dic['docchannel']['docchannel']=='中标信息' and 'win_tenderer' in str(prem):
			
 
				+    if channel_dic['docchannel']['docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
			
 
				         sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
			
 
				         finalists = [] # 入围供应商
			
 
				         i = 0
			
@@ -3872,7 +3890,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
 
				             ent = list_entitys[0][i]
			
 
				             b_idx_fr = ent.wordOffset_begin
			
 
				             e_idx_fr = ent.wordOffset_end
			
 
				-            multi_winner_l = [ent.entity_text]
			
 
				+            multi_winner_l = []
			
 
				             i += 1
			
 
				             if ent.entity_type in ['money']:
			
 
				                 money = float(ent.entity_text)
			
@@ -3880,7 +3898,8 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
 
				                     moneys_attachment.append(money)
			
 
				                 else:
			
 
				                     moneys.append(money)
			
 
				-            if ent.entity_type in ['org', 'company'] and ent.label == 2 and ent.values[ent.label]>0.5:
			
 
				+            if ent.entity_type in ['org', 'company'] and ent.label == 2 and ent.values[ent.label]>0.8:
			
 
				+                multi_winner_l.append(ent.entity_text)
			
 
				                 sentence_text = sentences[ent.sentence_index].sentence_text
			
 
				                 pre_text = sentence_text[max(0, b_idx_fr-10):b_idx_fr]
			
 
				                 if re.search('入围', pre_text) and re.search('未入围', pre_text)==None and ent.entity_text not in finalists:
			
@@ -3891,12 +3910,16 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
 
				                     e_idx_bh = ent_bh.wordOffset_end
			
 
				                     if ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh-e_idx_fr==1:
			
 
				                         sentence_text = sentences[ent_bh.sentence_index].sentence_text
			
 
				-                        if sentence_text[e_idx_fr:b_idx_bh] in ['；','、'] and (len(sentence_text)==e_idx_bh or sentence_text[e_idx_bh] in ['、', '，', '。']): # 修复多中标人刚好在文末index超出报错，例子 407126558
			
 
				+                        if sentence_text[e_idx_fr:b_idx_bh] in ['；','、','&','，'] and (len(sentence_text)==e_idx_bh or sentence_text[e_idx_bh] in ['；','、','&', '，', '。']): # 修复多中标人刚好在文末index超出报错，例子 407126558
			
 
				                             multi_winner_l.append(ent_bh.entity_text)
			
 
				                             e_idx_fr = e_idx_bh
			
 
				                             i = j + 1
			
 
				                         else:
			
 
				                             break
			
 
				+                    elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh==e_idx_fr:
			
 
				+                        multi_winner_l.append(ent_bh.entity_text)
			
 
				+                        e_idx_fr = e_idx_bh
			
 
				+                        i = j + 1
			
 
				                     else:
			
 
				                         break
			
 
				             if len(multi_winner_l)>=2:
			
@@ -3905,12 +3928,9 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
 
				                         continue
			
 
				                     for v in project.values():
			
 
				                         for d in v['roleList']:
			
 
				-                            if d.get('role_name', '') == 'win_tenderer':
			
 
				-                                winner = d.get('role_text')
			
 
				-                                if winner == multi_winner_l[0]:
			
 
				-                                    d['multi_winner'] = ','.join(multi_winner_l)
			
 
				-                                    break
			
 
				-
			
 
				+                            if d.get('role_name', '') == 'win_tenderer' and d.get('role_text', '') == multi_winner_l[0]:
			
 
				+                                d['multi_winner'] = ','.join(set(multi_winner_l))
			
 
				+                                break
			
 
				         if len(finalists)>=2:
			
 
				             for project in prem[0].values():
			
 
				                 if not isinstance(project, dict):
			
@@ -3994,10 +4014,21 @@ def update_prem(old_prem, new_prem):
 
				                     for d2 in v['roleList']:
			
 
				                         if d2 not in tmp_l: # 把新预测有，旧没有的角色添加上去
			
 
				                             old_prem[k]['roleList'].append(d2)
			
 
				-        if (len(new_prem)>1 or 'Project' not in new_prem) and 'Project' in old_prem and 'win_tenderer' in str(new_prem): # 表格提取到中标人的，去掉project包中标人
			
 
				-            for d in old_prem['Project']['roleList']:
			
 
				-                if d['role_name'] in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
			
 
				-                    old_prem['Project']['roleList'].remove(d) # 提取到其他包，去掉 project 里面的中标角色
			
 
				+        if len(old_prem)>1 and 'Project' in old_prem and 'win_tenderer' in str(new_prem): # 表格提取到中标人的，去掉project包中标人
			
 
				+            pro_winner = set()
			
 
				+            other_winner = set()
			
 
				+            for k in old_prem:
			
 
				+                for d in old_prem[k]['roleList']:
			
 
				+                    if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']:
			
 
				+                        if k == 'Project':
			
 
				+                            pro_winner.add(d['role_text'])
			
 
				+                        else:
			
 
				+                            other_winner.add(d['role_text'])
			
 
				+            if pro_winner & other_winner != set():
			
 
				+                # print('过滤掉多包相同中标人在不同包')
			
 
				+                for d in old_prem['Project']['roleList']:
			
 
				+                    if d['role_name'] in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
			
 
				+                        old_prem['Project']['roleList'].remove(d) # 提取到其他包，去掉 project 里面的中标角色
			
 
				         if multi_tendereeMoney and 'Project' in old_prem and float(old_prem['Project']['tendereeMoney'])!=0: # 表格提取到多标段招标金额，去掉Project包招标金额
			
 
				             old_prem['Project']['tendereeMoney'] = 0
			
 
				 
			
--- a/BiddingKG/dl/interface/header_set.pkl
+++ b/BiddingKG/dl/interface/header_set.pkl
--- a/BiddingKG/dl/interface/modelFactory.py
+++ b/BiddingKG/dl/interface/modelFactory.py
@@ -95,8 +95,13 @@ class Model_role_classify_word():
 
				         text = re.sub('[【（\[][0-9]{2,}[\]）】]|\d+([：:.-]\d+)+', 'd', text)
			
 
				         text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)
			
 
				         text = re.sub('\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)
			
 
				-        text = re.sub('序号：\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', '', text)
			
 
				+        text = re.sub('序号：\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、|([^\w]|^)序：?\d+', '  d', text) # ，序：1，单位名称：
			
 
				         text = re.sub('(中标|成交|中选|入围)(工程|项目)', '工程', text)  # 修复易错分为中标人
			
 
				+        text = re.sub('约定', '  ', text) # 修复 233233636 错分为中标人 国有产权网上竞价有关约定 辽阳市公共资源交易中心 ，标
			
 
				+        text = re.sub('中介机构', '投标机构', text) # 251058999 错分为中标人 序号：2，中介机构名称：
			
 
				+        text = re.sub('(采购|招标)人名称、地址和联系方式：', '采购人：', text) # 275065998
			
 
				+        if re.search('(最终)?排名：', text) and re.search('(最终)?排名：第?[123一二三]', text)==None:
			
 
				+            text = re.sub('(最终)?排名：', '    ', text)
			
 
				         # text = re.sub('(采购|招标|发布)机构', '发布人', text)
			
 
				         return text.replace('(', '（').replace(')', '）').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
			
 
				 
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -40,6 +40,12 @@ file = os.path.dirname(__file__) + '/agency_set.pkl'
 
				 with open(file, 'rb') as f:
			
 
				     agency_set = pickle.load(f)
			
 
				 
			
 
				+def is_agency(entity_text):  # Return True when entity_text looks like an agency-type organisation name, else False.
			
 
				+    if re.search('(招投?标|采购|代理|咨询|管理|物资|事务所?|顾问|监理|拍卖)[（）\w]{,4}(有限)?(责任)?公司|(采购|招投?标|交易|代理|咨询)[（）\w]{,4}(中心|服务所)|法院$',  # three alternatives: a bidding/procurement/agency/consulting/management/materials/firm/advisory/supervision/auction keyword followed within 4 chars by an (optionally "limited"/"liability") company suffix; a procurement/bidding/trading/agency/consulting keyword followed within 4 chars by a "centre"/"service office" suffix; or a name ending in "court". NOTE(review): the pattern is not a raw string, so `\w` relies on Python passing the unknown escape through - consider an r'' prefix.
			
 
				+                 entity_text) or entity_text in agency_set:  # re.search scans the whole name; only when it finds nothing is the name looked up in agency_set (unpickled from agency_set.pkl just above this function)
			
 
				+        return True  # matched the naming pattern or is a listed agency
			
 
				+    return False  # neither the pattern nor the set lookup matched
			
 
				+
			
 
				 from threading import RLock
			
 
				 dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
			
 
				               "prem":{"predictor":None,"Lock":RLock()},
			
@@ -794,13 +800,13 @@ class PREMPredict():
 
				             elif label in [2,3,4] and re.search('序号：\d+，\w{,2}候选', front):
			
 
				                 label = 5
			
 
				             elif label == 0:
			
 
				-                if re.search('拟邀请$|受邀谈判方', front):
			
 
				+                if re.search('拟邀请$|受邀谈判方|直购企业：$', front):
			
 
				                     label = 2
			
 
				                     values[label] = 0.501
			
 
				-                elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为：]+', front) and re.search('(招标|采购|咨询|代理|管理)\w*公司|(采购|交易)(中心|市场)', entity.entity_text):
			
 
				+                elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为：]+', front) and is_agency(entity.entity_text):
			
 
				                     label = 1
			
 
				                     values[label] = 0.501
			
 
				-                elif re.search('采用$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-
			
 
				+                elif re.search('采用$|异议受理部门', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-
			
 
				                     label = 5
			
 
				                 elif re.search('，单位名称：$', front) and re.search('^，(中标|中选)价格', behind):
			
 
				                     label = 2
			
@@ -824,14 +830,14 @@ class PREMPredict():
 
				                 elif re.search('税费', front) and re.search('^承担', behind):
			
 
				                     label = 5
			
 
				                 elif re.search('第一候补|第一后备|备选', front):
			
 
				-                    label = 2
			
 
				+                    label = 3
			
 
				                     values[label] = 0.6
			
 
				                 elif re.search('放弃中标资格$|是否中标：否|^(中标|成交)(公示|公告)', behind):
			
 
				                     values[2] = 0.5
			
 
				                     label = 5
			
 
				-                elif re.search('(承包权人|帐户名称)：$', front):
			
 
				+                elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单)：$', front):  # 234501112 民币元，序号：1，债务人： 东营市海宁工贸有限责任公司 ，债权本金： 262414286 八、中标后签约单位，合同签约单位：
			
 
				                     label = 5
			
 
				-                elif re.search('合同供方：?$', front):
			
 
				+                elif re.search('合同供方：?$|合同签约单位', front):
			
 
				                     label = 0
			
 
				                     values[label] = 0.5
			
 
				             elif re.search('是否中标：是，供应商', front) and label == 5:
			
@@ -849,11 +855,13 @@ class PREMPredict():
 
				                     values[label] = 0.501
			
 
				                 elif re.search('^：受', behind):  # 354009560 附件格式问题 ，中选中介服务机构通知书，编号：HZ2305120541，中汕项目管理有限公司：受惠东县人民政府大岭街道办事处委托
			
 
				                     label = 5
			
 
				-                elif re.search('发布机构', front) and re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站|馆)$', entity.entity_text):
			
 
				+                elif re.search('发布机构', front) and not is_agency(entity.entity_text):
			
 
				                     label = 0
			
 
				                     values[label] = 0.501
			
 
				                 elif re.search('开户银行：$', front): # 368214232 法定代表人：委托代理人：开户银行：鸡东建行
			
 
				                     label = 5
			
 
				+                elif re.search('委托$', front) and re.search('^(抽样|送检|看样)', behind):
			
 
				+                    label = 5
			
 
				             elif label in [3,4]:
			
 
				                 if re.search('第[二三]分(公司|店)，中标(人|供应商|单位|公司)：$', front):
			
 
				                     label = 2
			
@@ -922,7 +930,7 @@ class PREMPredict():
 
				                 label = 2
			
 
				             elif label == 1: # 错误中标金额处理
			
 
				                 if re.search('[:：，。](总金额|总价|单价|合价)(（万?元）)?：?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
			
 
				-                    values[label] = 0.49
			
 
				+                    values[label] = 0.5
			
 
				                 elif re.search('[\+=]（(中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', front): # 处理例如 241561780 如中标金额为 500-1000万元，则代理服务费=100 万元×0.5％+400万元×0.35％+（中标金额－500）万元
			
 
				                     values[label] = 0.49
			
 
				                 elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[（）]?[+×*-][\d.%]+', behind):
			
@@ -1376,7 +1384,7 @@ class RoleRulePredictor():
 
				                                            "(：?单位名称|：?名称|盖章)?[,，]?([(（]按综合排名排序[)）]|：择优选取)?[：:,，]$)"  # 解决表头识别不到加逗号情况，需前面为，。空
			
 
				         self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)" \
			
 
				                                            "(：?单位名称|：?名称|盖章)?([(（]按综合排名排序[)）]|：择优选取)?[：:是为]+$" \
			
 
				-                                           "|结果公示如下：摇出球号：\d+号，中介机构：$)"  # 取消逗号 并拒绝执行改进计划的供应商，华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标
			
 
				+                                           "|结果公示如下：摇出球号：\d+号，中介机构：$|直购企业：$)"  # 取消逗号 并拒绝执行改进计划的供应商，华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标
			
 
				 
			
 
				         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商)))|" \
			
 
				                                          "^((报价|价格)最低，|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”（）]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[，。]" \
			
@@ -1423,7 +1431,7 @@ class RoleRulePredictor():
 
				         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
			
 
				         
			
 
				         self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源，?为\w{2,4}资金|采购成本价")  # |建安费用 不作为招标金额
			
 
				-        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[）\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬（含税）：")  # 单写 总价 不能作为中标金额，很多表格有单价、总价
			
 
				+        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[）\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬（含税）：")  # 单写 总价 不能作为中标金额，很多表格有单价、总价
			
 
				         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
			
 
				         self.pattern_money_other = re.compile("代理费|服务费")
			
 
				         self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[:：]?[\(（]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
			
@@ -1451,8 +1459,9 @@ class RoleRulePredictor():
 
				                         if _role == "tendereeORagency":  # 2022/3/9: resolve a match whose role is ambiguous between tenderee and agency
			
 
				                             # print('p_entity_sentenceindex:', p_entity.sentence_index)
			
 
				 
			
 
				-                            if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', entity_text) \
			
 
				-                                    or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', entity_text) == None:
			
 
				+                            # if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', entity_text) \
			
 
				+                            #         or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', entity_text) == None:
			
 
				+                            if not is_agency(entity_text):  # fixed inverted test: the removed regex made non-agency names tenderee, so agency-like names must take the else branch
			
 
				                                 _role = 'tenderee'
			
 
				                             else:
			
 
				                                 _role = "agency"
			
@@ -1586,7 +1595,8 @@ class RoleRulePredictor():
 
				                                             break
			
 
				                                         if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
			
 
				                                                 _name) >= 0:
			
 
				-                                            if p_entity.entity_text in agency_set or re.search('(代理|管理|咨询|招投?标|采购)\w{,6}公司', p_entity.entity_text): # 在代理人集合的作为代理人
			
 
				+                                            # if p_entity.entity_text in agency_set or re.search('(代理|管理|咨询|招投?标|采购)\w{,6}公司', p_entity.entity_text): # 在代理人集合的作为代理人
			
 
				+                                            if is_agency(p_entity.entity_text): # 2024/3/29 统一方法判断是否为代理
			
 
				                                                 find_flag = True
			
 
				                                                 _label = 1
			
 
				                                                 p_entity.label = _label
			
@@ -1819,8 +1829,11 @@ class RoleRuleFinalAdd():
 
				         :param list_codenames:
			
 
				         :return:
			
 
				         '''
			
 
				+
			
 
				         # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
			
 
				         main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
			
 
				+        if len(list_sentences[0])>0 and list_sentences[0][-1].in_attachment:
			
 
				+            main_sentences = list_sentences[0][-1:] + main_sentences[-2:]
			
 
				         if len(main_sentences)==0:
			
 
				             return 0
			
 
				         # end_tokens = []
			
@@ -1834,7 +1847,19 @@ class RoleRuleFinalAdd():
 
				             # sear_ent = re.search('[，。]([\u4e00-\u9fa5()（）]{5,20})，?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
			
 
				             sear_ent = re.search('([，。；]|^)(?P<entity>[\u4e00-\u9fa5()（）]{5,20}(，?[\u4e00-\u9fa5]{,8})?)，?\s*(公告日期：)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
			
 
				             if sear_ent:
			
 
				+                b, e = sear_ent.span()
			
 
				+                if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]):
			
 
				+                    sear_ent = None
			
 
				                 break
			
 
				+        if sear_ent is None:  # fallback: nothing usable was found above, so retry on the tail of the whole article content
			
 
				+            text_end = list_articles[0].content[-100:]  # only the last 100 characters are searched
			
 
				+            sear_ent = re.search(
			
 
				+                r'([，。；]|^)(?P<entity>[\u4e00-\u9fa5()（）]{5,20}(，?[\u4e00-\u9fa5]{,8})?)，?\s*(公告日期：)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?',  # raw string: \s and \- are regex escapes, not string escapes
			
 
				+                text_end)
			
 
				+            if sear_ent:
			
 
				+                b, e = sear_ent.span()
			
 
				+                if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]):  # discard the match when either keyword occurs in the 10 characters before it or anywhere after it
			
 
				+                    sear_ent = None
			
 
				         sear_ent1 = re.search('((招标|采购)联系人)[，:：][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()（）]{4,20})', list_articles[0].content[:5000])
			
 
				         sear_ent2 = re.search('[，：](户名|开户名称|发票抬头|单位名称|名称)[:：](?P<entity>[\u4e00-\u9fa5()（）]{5,20})[，。]', list_articles[0].content[:5000])
			
 
				         if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
			
@@ -1850,14 +1875,14 @@ class RoleRuleFinalAdd():
 
				         ents = []
			
 
				         for ent in list_entitys[0]:
			
 
				             if ent.entity_type in ['org', 'company']:
			
 
				-                if ent.label == 0 and ent.values[ent.label]>0.5:
			
 
				+                if ent.label == 0 and ent.values[ent.label]>0.55:
			
 
				                     if '公共资源交易中心' in ent.entity_text:  # 公共资源交易中心不算招标或代理，只算平台
			
 
				                         # ent.label = 5
			
 
				                         ent.values[ent.label] = 0.6 if ent.values[ent.label]>0.6 else 0.5 # 改为降低概率，不改类别，防止 382573066 明显招标人表达不提取
			
 
				                         continue
			
 
				                     tenderee_list.append(ent.entity_text)
			
 
				                     tenderee_notfound = False
			
 
				-                elif ent.label == 1:
			
 
				+                elif ent.label == 1 and ent.values[ent.label]>0.55:
			
 
				                     agency_list.append(ent.entity_text)
			
 
				                     agency_notfound = False
			
 
				                 elif ent.label == 5:
			
@@ -1869,33 +1894,24 @@ class RoleRuleFinalAdd():
 
				                 ent_re = _sear_ent.group('entity')
			
 
				                 ent_re = ent_re.replace('，', '').replace("(","（").replace(")","）")
			
 
				 
			
 
				-                if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站)$', ent_re)
			
 
				-                                                  or re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) == None) \
			
 
				-                        and ent_re not in agency_list and ent_re not in agency_set:
			
 
				+                if tenderee_notfound or agency_notfound:
			
 
				                     n = 0
			
 
				                     for i in range(len(ents) - 1, -1, -1):
			
 
				                         if not ents[i].in_attachment:
			
 
				                             n += 1
			
 
				                         if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
			
 
				                             break
			
 
				-                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and re.search('(大学|中学|小学|幼儿园|医院)$', ents[i].entity_text)) or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
			
 
				-                            ents[i].label = 0
			
 
				-                            ents[i].values[0] = 0.51 # 修改为比标题概率略高
			
 
				-                            tenderee_notfound = False
			
 
				-                            # log('正则最后补充实体： %s'%(ent_re))
			
 
				-                            break
			
 
				-                elif agency_notfound == True and ent_re not in tenderee_list and (
			
 
				-                        re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) or ent_re in agency_set):
			
 
				-                    n = 0
			
 
				-                    for i in range(len(ents) - 1, -1, -1):
			
 
				-                        if not ents[i].in_attachment:
			
 
				-                            n += 1
			
 
				-                        if n > 3 and _sear_ent==sear_ent:  # 文章末尾角色加日期这种只找后三个实体
			
 
				+                        elif _sear_ent==sear_ent and ents[i].label != 5:  # 后面有角色的实体的停止继续往前
			
 
				                             break
			
 
				-                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
			
 
				-                            ents[i].label = 1
			
 
				-                            ents[i].values[1] = 0.51 # 修改为比标题概率略高
			
 
				-                            agency_notfound = False
			
 
				+                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and re.search('(大学|中学|小学|幼儿园|医院)$', ents[i].entity_text)) or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
			
 
				+                            if agency_notfound and is_agency(ents[i].entity_text) and ents[i].entity_text not in tenderee_list:
			
 
				+                                ents[i].label = 1
			
 
				+                                ents[i].values[1] = 0.51 # 修改为比标题概率略高
			
 
				+                                agency_notfound = False
			
 
				+                            elif tenderee_notfound and not is_agency(ents[i].entity_text) and ents[i].entity_text not in agency_list:
			
 
				+                                ents[i].label = 0
			
 
				+                                ents[i].values[0] = 0.51 # 修改为比标题概率略高
			
 
				+                                tenderee_notfound = False
			
 
				                             # log('正则最后补充实体： %s'%(ent_re))
			
 
				                             break
			
 
				                     if not tenderee_notfound:
			
@@ -2191,7 +2207,7 @@ class RoleGrade():
 
				         self.tenderee_left_5 = "(?P<tenderee_left_5>(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)"
			
 
				         self.agency_left_9 = "(?P<agency_left_9>代理)"
			
 
				         self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]名|排[名序]：1|名次：1)"
			
 
				-        self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方))"
			
 
				+        self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方|最[终后]选[择取]))"  # 229435497 最后选择西平，县中原彩印有限公司，作为此项目中标供应商，
			
 
				         self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
			
 
				         self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]：2|名次：2))"
			
 
				         self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]：3|名次：3))"
			
@@ -2212,6 +2228,10 @@ class RoleGrade():
 
				         org_winner = []
			
 
				         company_winner = []
			
 
				         org_tenderee = []
			
 
				+        agency_l = []
			
 
				+        agency_like_tenderee = [] # 类似招标人的代理人实体列表
			
 
				+        low_prob_agency = []
			
 
				+        low_prob_tenderee = []
			
 
				         for entity in list_entitys[0]:
			
 
				             if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> min_prob:
			
 
				                 text = sentences[entity.sentence_index].sentence_text
			
@@ -2220,6 +2240,16 @@ class RoleGrade():
 
				                 b = entity.wordOffset_begin
			
 
				                 e = entity.wordOffset_end
			
 
				                 not_found = 1
			
 
				+                if re.search('(乙方：甲方：|甲方：乙方：)$', text[max(0, b-span):b]):
			
 
				+                    entity.label = 0 if entity.entity_type == 'org' else 2
			
 
				+                    entity.values[entity.label] = 0.55
			
 
				+                    continue
			
 
				+                elif re.search('(采购|招标)人（?或(采购|招标)?代理机构）?：$', text[max(0, b-span):b]):
			
 
				+                    entity.label = 1 if is_agency(entity.entity_text) else 0
			
 
				+                    entity.values[entity.label] = 0.8
			
 
				+                    continue
			
 
				+                elif re.search('(采购|招标|询比?价|遴选|寻源|比选)机构[是为：]+', text[max(0, b-span):b]) and not is_agency(entity.entity_text):
			
 
				+                    agency_like_tenderee.append(entity)
			
 
				                 for pattern in self.pattern_list:
			
 
				                     if 'left' in pattern:
			
 
				                         context = text[max(0, b-span):b]
			
@@ -2262,10 +2292,31 @@ class RoleGrade():
 
				                         company_winner.append(entity)  # 保存中标人实体
			
 
				                 if entity.label == 0 and entity.values[entity.label]> min_prob:
			
 
				                     org_tenderee.append(entity.entity_text)  # 保存所有招标人名称
			
 
				-            if entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label]<0.6:  # 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
			
 
				-                # log('修改概率小于0.6的且在大数据代理集合里面的招标人为代理人%s:'%entity.entity_text)
			
 
				+                elif entity.label == 1 and entity.values[entity.label]> min_prob:
			
 
				+                    agency_l.append(entity.entity_text)
			
 
				+            # if entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label]<0.6:  # 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
			
 
				+            #     # log('修改概率小于0.6的且在大数据代理集合里面的招标人为代理人%s:'%entity.entity_text)
			
 
				+            #     entity.label = 1
			
 
				+            #     entity.values[entity.label] = 0.5
			
 
				+
			
 
				+            elif entity.entity_type in ['org', 'company'] and entity.label in [1, 0] and 0.5<=entity.values[entity.label]<0.6:
			
 
				+                if entity.label == 1:
			
 
				+                    low_prob_agency.append(entity)
			
 
				+                else:
			
 
				+                    low_prob_tenderee.append(entity)
			
 
				+
			
 
				+        if org_tenderee == [] and agency_like_tenderee:
			
 
				+            for entity in agency_like_tenderee:
			
 
				+                entity.label = 0
			
 
				+                entity.values[entity.label] = 0.6
			
 
				+        for entity in low_prob_agency:
			
 
				+            if entity.entity_text in org_tenderee:
			
 
				+                entity.label = 0
			
 
				+                entity.values[entity.label] = 0.6
			
 
				+        for entity in low_prob_tenderee:
			
 
				+            if entity.entity_text in agency_l:
			
 
				                 entity.label = 1
			
 
				-                entity.values[entity.label] = 0.5
			
 
				+                entity.values[entity.label] = 0.6
			
 
				 
			
 
				         if org_winner != []:
			
 
				             flag = 0
			
@@ -5874,8 +5925,10 @@ class TableTag2List():
 
				                         if text_process != None:
			
 
				                             # text = [re.sub('\xa0', '', text_process(cell, final=False)), 0]
			
 
				                             # td_text = re.sub('\xa0', '', text_process(cell, final=False))
			
 
				-                            td_text = re.sub('\s|\xa0', '', str(cell.get_text())) # 修复 370835008 td 内公司被p标签拆分为两半情况
			
 
				-                            if len(td_text)>30:
			
 
				+                            td_text = re.sub('\s|\xa0', '', str(cell.get_text()))  # 修复 370835008 td 内公司被p标签拆分为两半情况
			
 
				+                            if 'title' in cell.attrs and cell.get_text().strip().endswith('...') and cell.get_text().strip()[:-3] in cell.attrs['title']:
			
 
				+                                td_text = cell.attrs['title']  # 修复 类似 215597851 省略号隐藏内容
			
 
				+                            elif len(td_text)>30:
			
 
				                                 td_text = re.sub('\xa0', '', text_process(cell, final=False))
			
 
				                             if td_text == "":
			
 
				                                 td_text = ' '
			
@@ -5953,7 +6006,7 @@ class TablePremExtractor(object):
 
				 
			
 
				 
			
 
				     def find_header(self, td_list):
			
 
				-        fix_td_list = [re.sub('[:：]$|^[一二三四五六七八九十0-9]{1,3}、|(（[\w、×*/]{1,20}）)$|（不?含税）|/万?元|拟', '', it) for it in td_list]  # 去除表头无关信息，方便匹配判断是否为表头
			
 
				+        fix_td_list = [re.sub('[:：]$|^[一二三四五六七八九十0-9]{1,3}、|(（[\w、×*/]{1,20}）)$|（不?含税）|/万?元|拟|\s', '', it) for it in td_list]  # 去除表头无关信息，方便匹配判断是否为表头
			
 
				         header_dic = dict()
			
 
				         flag = False
			
 
				         contain_header = False
			
@@ -6059,14 +6112,14 @@ class TablePremExtractor(object):
 
				             return {}
			
 
				         for i in df.index:
			
 
				             same_package = False  # 连续重复包号，一般是 rowspan 造成；一包 多个采购
			
 
				-            project_code = df.loc[i, headers['project_code'][0]] if "project_code" in headers else ""
			
 
				-            package_code_raw = df.loc[i, headers['package_code'][0]] if "package_code" in headers else ""
			
 
				-            project_name = df.loc[i, headers['project_name'][0]] if "project_name" in headers else ""
			
 
				-            tenderee = df.loc[i, headers['tenderee'][0]] if "tenderee" in headers else ""
			
 
				-            tenderer = df.loc[i, headers['tenderer'][0]] if "tenderer" in headers else ""
			
 
				-            budget_ = df.loc[i, headers['budget'][0]] if "budget" in headers else ""
			
 
				-            bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
			
 
				-            win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
			
 
				+            project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
			
 
				+            package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else ""
			
 
				+            project_name = df.loc[i, headers['project_name'][0]].strip() if "project_name" in headers else ""
			
 
				+            tenderee = df.loc[i, headers['tenderee'][0]].strip() if "tenderee" in headers else ""
			
 
				+            tenderer = df.loc[i, headers['tenderer'][0]].strip() if "tenderer" in headers else ""
			
 
				+            budget_ = df.loc[i, headers['budget'][0]].strip() if "budget" in headers else ""
			
 
				+            bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else ""
			
 
				+            win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
			
 
				 
			
 
				             if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
			
 
				                 # print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
			
@@ -6090,7 +6143,7 @@ class TablePremExtractor(object):
 
				 
			
 
				             if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取  防止类似 328485591 作为多包
			
 
				                 break
			
 
				-            if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and re.search('否|未(中标|成交|中选)', win_sort):
			
 
				+            if "win_sort" in headers and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and (win_sort == '' or re.search('否|未(中标|成交|中选)', win_sort)):  # 2024/04/2 fix for 252208201: under a "won or not" header, a blank cell or an explicit "no" means not a winner; the blank case was unreachable behind `win_sort != ""`
			
 
				                 continue
			
 
				             if "win_sort" in headers and win_sort == "": # the table has a win_sort column but this row's cell is blank: skip the row
			
 
				                 continue
			
@@ -6303,7 +6356,7 @@ class CandidateExtractor(object):
 
				             self.headerset = pickle.load(f)
			
 
				 
			
 
				     def find_header(self, td_list):
			
 
				-        fix_td_list = [re.sub('[:：]$|^[一二三四五六七八九十0-9]{1,3}、|(（[\w、×*/]{1,20}）)$|（不?含税）|/万?元|拟', '', it) for it in td_list] # 去除表头无关信息，方便匹配判断是否为表头
			
 
				+        fix_td_list = [re.sub('[:：]$|^[一二三四五六七八九十0-9]{1,3}、|(（[\w、×*/]{1,20}）)$|（不?含税）|/万?元|拟|\s', '', it) for it in td_list] # 去除表头无关信息，方便匹配判断是否为表头
			
 
				         header_dic = dict()
			
 
				         flag = False
			
 
				         contain_header = False
			
@@ -6384,23 +6437,28 @@ class CandidateExtractor(object):
 
				         findtop3 = False
			
 
				         findmoney = False
			
 
				         line_num = 0
			
 
				+        line_package = None
			
 
				         for i in df.index:
			
 
				-            package_code_raw = df.loc[i, headers['package_code'][0]] if "package_code" in headers else ""
			
 
				-            candidate_ = df.loc[i, headers['candidate'][0]] if "candidate" in headers else ""
			
 
				-            win_or_not = df.loc[i, headers['win_or_not'][0]] if "win_or_not" in headers else ""
			
 
				+            package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else ""
			
 
				+            candidate_ = df.loc[i, headers['candidate'][0]].strip() if "candidate" in headers else ""
			
 
				+            win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
			
 
				             # budget_ = df.loc[i, headers['budget'][0]] if "budget" in headers else ""
			
 
				-            bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
			
 
				-            win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
			
 
				-            win_tenderer = df.loc[i, headers['win_tenderer'][0]] if "win_tenderer" in headers else ""
			
 
				-            second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
			
 
				-            third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
			
 
				+            bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else ""
			
 
				+            win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
			
 
				+            win_tenderer = df.loc[i, headers['win_tenderer'][0]].strip() if "win_tenderer" in headers else ""
			
 
				+            second_tenderer = df.loc[i, headers['second_tenderer'][0]].strip() if "second_tenderer" in headers else ""
			
 
				+            third_tenderer = df.loc[i, headers['third_tenderer'][0]].strip() if "third_tenderer" in headers else ""
			
 
				 
			
 
				             if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头， 停止匹配 # 排除 ,win_sort 避免367940050漏提取
			
 
				                 # print('包含表头， 停止匹配')
			
 
				                 break
			
 
				             if len(set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2:  # 全部为空或内容一样 停止匹配
			
 
				                 # print('全部为空或内容一样 停止匹配')
			
 
				-                break
			
 
				+                if len(set(df.loc[i,:]))==1 and re.search('^第?([一二三四五六七八九十]{1,3}|[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段])([一二三四五六七八九十]{1,3}|[a-zA-Z0-9-]{,9})?$', win_sort):
			
 
				+                    line_package = win_sort
			
 
				+                    continue
			
 
				+                else:
			
 
				+                    break
			
 
				 
			
 
				             if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # 修复某些表头不说 排名，直接用候选人代替
			
 
				                 col_indx = headers['candidate'][0] -1
			
@@ -6411,6 +6469,8 @@ class CandidateExtractor(object):
 
				                     win_sort = pre_col
			
 
				 
			
 
				             package_code = package_code_raw
			
 
				+            if package_code == '' and line_package:
			
 
				+                package_code = line_package
			
 
				 
			
 
				             # candidate = candidate_ if self.is_role(candidate_) else ""
			
 
				             # tenderer = tenderer if self.is_role(tenderer) else ""
			
@@ -6637,6 +6697,10 @@ class WebsourceTenderee():
 
				         '''
			
 
				         p = '(医院|学院|学校|中学|小学|大学|幼儿园|保健院|党校|银行|研究院|血站|红十字会|防治院|研究所)'
			
 
				         web_ree = self.webno2ree.get(web_source_no, '')
			
 
				+        if web_source_no.startswith('18591-') and web_ree == "":
			
 
				+            web_ree = '中国人民解放军总医院'
			
 
				+        elif web_source_no.startswith('Y00484-') and web_ree == "":
			
 
				+            web_ree = '航空总医院'
			
 
				         if web_ree != '':
			
 
				             if 'Project' in prem[0]['prem']:
			
 
				                 find_tenderee = False
			
@@ -7035,18 +7099,18 @@ if __name__=="__main__":
 
				     # rs = product_attr.predict(docid='', html=html, page_time="")
			
 
				     # print(rs)
			
 
				 
			
 
				-    docid = ""
			
 
				-    title = ''
			
 
				-    with open('d:/html/2.html', 'r', encoding='utf-8') as f:
			
 
				-        html = f.read()
			
 
				-    tb_extract = TablePremExtractor()
			
 
				-    rs = tb_extract.predict(html, [
			
 
				-        "广东省广裕集团嘉顺实业有限责任公司",
			
 
				-        "广州顺为招标采购有限公司",
			
 
				-        "中华人民共和国"
			
 
				-    ], web_source_name = '河钢供应链管理平台')
			
 
				-    print('标段数：',len(rs))
			
 
				-    print(rs)
			
 
				+    # docid = ""
			
 
				+    # title = ''
			
 
				+    # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
			
 
				+    #     html = f.read()
			
 
				+    # tb_extract = TablePremExtractor()
			
 
				+    # rs = tb_extract.predict(html, [
			
 
				+    #     "广东省广裕集团嘉顺实业有限责任公司",
			
 
				+    #     "广州顺为招标采购有限公司",
			
 
				+    #     "中华人民共和国"
			
 
				+    # ], web_source_name = '河钢供应链管理平台')
			
 
				+    # print('标段数：',len(rs))
			
 
				+    # print(rs)
			
 
				 
			
 
				     # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
			
 
				     # # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]