فهرست منبع

优化表格多包提取;优化站源唯一招标人替换;优化金额提取

lsm 1 سال پیش
والد
کامیت
ee9e1b8c33

+ 2 - 0
BiddingKG/dl/interface/Entitys.py

@@ -340,6 +340,8 @@ class Role():
         result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
                   'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}
+        if result['role_name'] == 'tenderee':
+            result['role_prob'] = self.role_prob
         return result
 
 # 用于KM算法的组合配对

+ 17 - 10
BiddingKG/dl/interface/Preprocessing.py

@@ -2525,14 +2525,18 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
     # 使用正则识别金额
     entity_type = "money"
     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|资金|(控制|拦标)价)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
                           "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
     # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
 
-    pattern_money = re.compile("%s|%s|%s|%s" % (
-    list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
-    list_money_pattern["front_m"]))
+    # pattern_money = re.compile("%s|%s|%s|%s" % (
+    # list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
+    # list_money_pattern["front_m"]))
+
+    pattern_money = re.compile("%s|%s|%s" % (
+        list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"]
+        )) # 2024/01/30  改为 list_money_pattern["front_m"] 单独搜索,避免与 key_word 冲突  详见合同元,合同金额:378.8万元,
 
     if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
         found_yeji += 1
@@ -2541,9 +2545,11 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
     else:
         ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text)  # 过滤掉收费标准里面的金额
         if ser:
-            all_match = re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' ' * len(ser.group(0))))
-        else:
-            all_match = re.finditer(pattern_money, sentence_text)
+            sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
+        all_match = re.finditer(pattern_money, sentence_text)
+        if all_match == None:
+            all_match = re.finditer(list_money_pattern["front_m"], sentence_text)
+    # print('all_match:', all_match)
     for _match in all_match:
         # print('_match: ', _match.group())
         if len(_match.group()) > 0:
@@ -2678,9 +2684,10 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
             elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
                            sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
                 notes = '成本警戒线'
-            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]', sentence_text[_match.span()[0]:_match.span()[1]]):
-                cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
-                notes = cost_re.group(1)
+            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]|服务金额', sentence_text[_match.span()[0]:_match.span()[1]]):
+                # cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
+                # notes = cost_re.group(1)
+                notes = '招标或中标金额'
             elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
                 notes = '单价'
             elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:

+ 2 - 2
BiddingKG/dl/interface/extract.py

@@ -277,7 +277,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time["attrs"] = round(time.time()-start_time,2)
 
     '''表格要素提取'''
-    table_prem = predictor.getPredictor("tableprem").predict(text, nlp_enterprise)
+    table_prem = predictor.getPredictor("tableprem").predict(text, nlp_enterprise, web_source_name)
     # print('表格提取中标人:', table_prem)
     # print('原提取角色:', prem[0]['prem'])
     if table_prem:
@@ -345,7 +345,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-01-29'}
+    version_date = {'version_date': '2024-02-01'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
 
     '''最终检查修正招标、中标金额'''

+ 14 - 6
BiddingKG/dl/interface/getAttributes.py

@@ -311,7 +311,7 @@ def get_legal_comba(list_entity,dict_role_combination):
                 _dict = dict()
                 for _key in item.keys():
                     _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
-                _prob = getSumExpectation(dict_pack_entity_prob, _dict)
+                _prob, max_role_prob = getSumExpectation(dict_pack_entity_prob, _dict)
                 if _prob>MAX_PROB:
                     MAX_PROB = _prob
                     _MAX_PROB_COMBA = [item]
@@ -396,11 +396,14 @@ def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
     '''
     #修改为同一个实体只取对应包-角色的最大的概率值
     expect = 0
+    max_prob = 0
     dict_entity_prob = {}
     for _key_pack_entity in dict_pack_entity_prob:
         _key_pack = _key_pack_entity.split("$text$")[0]
         role_prob = dict_pack_entity_prob[_key_pack_entity][1]
         if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
+            if role_prob > max_prob:
+                max_prob = role_prob
             if _key_pack_entity in dict_entity_prob.keys():
                 if dict_entity_prob[_key_pack_entity]<role_prob:
                     dict_entity_prob[_key_pack_entity] = role_prob
@@ -435,7 +438,7 @@ def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
     for _key in dict_entity_prob.keys():
         symbol = 1 if dict_entity_prob[_key]>0 else -1 
         expect += symbol*math.pow(dict_entity_prob[_key],2)
-    return expect
+    return expect, max_prob
 
 
 def getRoleList(list_sentence,list_entity,on_value = 0.5):
@@ -517,7 +520,9 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
     _index = 0
     dict_pack_entity_prob = get_dict_entity_prob(list_entity)
     for item_combination in list_real_comba:
-        expect = getSumExpectation(dict_pack_entity_prob, item_combination)
+        expect, max_role_prob = getSumExpectation(dict_pack_entity_prob, item_combination)
+        for k, v in item_combination.items():
+            item_combination[k] = [v, max_role_prob]
         if expect>max_expect:
             max_index = _index
             max_expect = expect
@@ -529,12 +534,14 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
             packageName = _key.split("$$")[0]
             label = _key.split("$$")[1]
             role_name = dict_role_id.get(str(label))
-            entity_text = list_real_comba[max_index][_key]
+            # entity_text = list_real_comba[max_index][_key]
+            entity_text = list_real_comba[max_index][_key][0]
+            entity_prob = list_real_comba[max_index][_key][1]
             if packageName in dict_PackageCode.keys():
                 packagecode = dict_PackageCode.get(packageName)
             else:
                 packagecode = ""
-            RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
+            RoleList.append(PREM(packageName,packagecode,role_name,entity_text,entity_prob,0,0.0,[]))
             RoleSet.add(entity_text)
 
     #根据最优树来修正list_entity中角色对包的连接
@@ -2875,7 +2882,8 @@ def initPackageAttr(RoleList,PackageSet):
         if packDict[item.packageName]["code"] =="":
             packDict[item.packageName]["code"] = item.packageCode
         # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
-        packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,金额单位)
+        # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,金额单位)
+        packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,item.role_prob,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,金额单位)
     return packDict
                 
 def getPackageRoleMoney(list_sentence,list_entity,list_outline):

+ 40 - 25
BiddingKG/dl/interface/predictor.py

@@ -26,6 +26,7 @@ from bs4 import BeautifulSoup
 import copy
 import calendar
 import datetime
+from BiddingKG.dl.entityLink.entityLink import get_business_data
 # import fool   # 统一用 selffool ,阿里云上只有selffool 包
 
 cpu_num = int(os.environ.get("CPU_NUM",0))
@@ -928,9 +929,9 @@ class PREMPredict():
             elif label ==0: # 错误招标金额处理
                 if entity.notes in ["投资", "总投资","工程造价"] or re.search('最低限价:?$', front) or re.search('服务内容:([\d,.]+万?亿?元?-?)$', front):
                     values[label] = 0.49
-                elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
+                elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+|(含)', behind):
                     values[label] = 0.49
-                elif re.search('(含|在|包括|[大小等高低]于)$|[\d.%]+[+×*-]$', front):
+                elif re.search('(含|在|包括|[大小等高低]于|如预算金额为)$|[\d.%]+((含))?[+×*-]$', front):
                     values[label] = 0.49
             elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
                 label = 1
@@ -938,12 +939,13 @@ class PREMPredict():
             entity.set_Money(label, values)
 
     def correct_money_by_rule(self, title, list_entitys, list_articles):
-        if len(re.findall('监理|施工|设计|勘察', title)) == 1 and re.search('施工|总承包|epc|EPC', title) == None:
-            keyword = re.search('监理|设计|勘察', title).group(0)
+        if (len(re.findall('监理|施工|设计|勘察', title)) == 1 and re.search('施工|总承包|epc|EPC', title) == None) or re.search('服务金额', list_articles[0].content):
+            # keyword = re.search('监理|设计|勘察', title).group(0)
             for list_entity in list_entitys:
                 for _entity in list_entity:
                     # print('keyword:',keyword, '_entity.notes :',_entity.notes)
-                    if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label == 2:
+                    # if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label == 2:
+                    if _entity.entity_type == "money" and _entity.notes == '招标或中标金额' and _entity.label == 2:
                         # if channel_dic['docchannel'] == "招标公告":
                         if re.search('中标|成交|中选|中价|中租|结果|入围', title + list_articles[0].content[:100]) == None:
                             _entity.values[0] = 0.51
@@ -1417,7 +1419,7 @@ class RoleRulePredictor():
 
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
-        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|服务金额|采购成本价")  # |建安费用 不作为招标金额
+        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|采购成本价")  # |建安费用 不作为招标金额
         self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
         self.pattern_money_other = re.compile("代理费|服务费")
@@ -5789,7 +5791,7 @@ class TablePremExtractor(object):
         self.head_rule_dic = {
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
-            "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的)(名称?|内容)",
+            "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的|^包)(名称?|内容)",
             "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
@@ -5825,6 +5827,8 @@ class TablePremExtractor(object):
                     if re.search(v, text):
                         if k  in ['tenderer'] and re.search('是否', text):
                             continue
+                        if k in header_dic:
+                            continue
                         header_dic[k] = (i, text)
                         num += 1
                 if num>1:
@@ -5893,7 +5897,7 @@ class TablePremExtractor(object):
         else:
             return ''
 
-    def extract_from_df(self, df, headers):
+    def extract_from_df(self, df, headers, web_source_name):
         prem_dic = {}
         previous_package = ""  # 上一行包号
         multi_same_package = False # 非连续的重复包号
@@ -5901,6 +5905,10 @@ class TablePremExtractor(object):
         link_set = set()
         not_package = True if 'project_name' in headers and re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \
                           'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
+
+        if set(['project_code', 'package_code', 'tenderee', 'tenderer']) & set(headers) == set(): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683
+            # print('没有包号及角色的不要')
+            return {}
         for i in df.index:
             same_package = False  # 连续重复包号,一般是 rowspan 造成;一包 多个采购
             project_code = df.loc[i, headers['project_code'][0]] if "project_code" in headers else ""
@@ -5961,7 +5969,11 @@ class TablePremExtractor(object):
                 link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
 
             package = uniform_package_name(package_code) if package_code else str(len(prem_dic)+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
-
+            if project_code != "":
+                uni_project_code= uniform_package_name(project_code)
+                if uni_project_code != "" and package != "":
+                    # print('重组包号:', '%s_%s'%(uni_project_code, package))
+                    package = '%s_%s'%(uni_project_code, package)
             if package_code_raw!='':
                 if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
                     package_fix2raw[package] = package_code_raw
@@ -5991,7 +6003,7 @@ class TablePremExtractor(object):
                               budget_header + budget_) and budget < 100) or budget > 50000000000:  # 如果是费率或大于500亿的金额改为0
                     budget = 0
                 if budget > 0:
-                    if same_package and prem_dic[package]['tendereeMoney'] != budget: #
+                    if same_package and prem_dic[package]['tendereeMoney'] != budget: # 处理 类似 136839070 一包多物品多预算
                         prem_dic[package]['tendereeMoney'] += budget
                     else:
                         prem_dic[package]['tendereeMoney'] = budget
@@ -6017,7 +6029,7 @@ class TablePremExtractor(object):
                     break
 
                 bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率', bid_amount_)==None and 'bid_amount' in headers else (0, '')
-                if 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
+                if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
                     if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
                         prem_dic.pop(package)
                     continue
@@ -6042,14 +6054,14 @@ class TablePremExtractor(object):
                 })
             if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃 并不再继续往下匹配
                 prem_dic.pop(package)
-                break
+                # break # 注释掉避免 400084571 某些包废标 中断匹配
             if multi_same_package: # 预处理后包号重复的,使用原始包号
                 for k, v in package_fix2raw.items():
                     if k in prem_dic:
                         prem_dic[v] = prem_dic.pop(k)
         return prem_dic
 
-    def get_prem(self, soup):
+    def get_prem(self, soup, web_source_name=''):
         tables = soup.find_all('table')
         tables.reverse()
 
@@ -6088,7 +6100,7 @@ class TablePremExtractor(object):
                             break
                     if len(table_items) > 0:
                         df = pd.DataFrame(table_items)
-                        prem_ = self.extract_from_df(df, headers)
+                        prem_ = self.extract_from_df(df, headers, web_source_name)
                         # rs_dic.update(prem_)
                         table_prem.update(prem_)
                     i = j - 1
@@ -6106,7 +6118,7 @@ class TablePremExtractor(object):
             table.extract()
         return rs_dic
 
-    def predict(self, html, nlp_enterprise):
+    def predict(self, html, nlp_enterprise, web_source_name=""):
         html = re.sub("<html>|</html>|<body>|</body>","",html)
         html = re.sub("##attachment##","",html)
         soup = BeautifulSoup(html, 'lxml')
@@ -6114,9 +6126,9 @@ class TablePremExtractor(object):
         self.nlp_enterprise = nlp_enterprise
         if richText:
             richText = richText.extract()  # 过滤掉附件
-        prem = self.get_prem(soup)
+        prem = self.get_prem(soup, web_source_name)
         if prem == {} and richText:
-            prem = self.get_prem(richText)
+            prem = self.get_prem(richText, web_source_name)
         if len(prem) == 1:  # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
             k = list(prem)[0]
             if k == '1' or len(k) > 2:
@@ -6487,6 +6499,8 @@ class WebsourceTenderee():
                             d['role_text'] = web_ree
                         elif re.search('大学$', web_ree) and re.search('学院$', d['role_text']) and web_ree not in d['role_text']:
                             d['role_text'] = web_ree
+                        elif d.get('role_prob', 0) < 0.8 and get_business_data(d['role_text'])[0] == False: # 20240201 概率低于0.8且没有工商数据的替换为站源招标人
+                            d['role_text'] = web_ree
                         # elif re.search(p, web_ree) and (re.search(p, d['role_text'])==None and len(d['role_text'])<6): # 数据源唯一招标人以医院等结尾,角色中无相关关键词的,替换为数据源招标人
                         #     d['role_text'] = web_ree
                         # elif re.search('有限(责任)?公司', web_ree) and (re.search('有限(责任)?公司', d['role_text'])==None and len(d['role_text'])<6):
@@ -6865,13 +6879,13 @@ if __name__=="__main__":
     #     # print("cost_time:", json.loads(requests_result.text)['cost_time'])
     #     # print(MAX_LEN, len(sentence), len(list_sentence))
 
-    docid = ""
-    title = ''
-    with open('d:/html/2.html', 'r', encoding='utf-8') as f:
-        html = f.read()
-    product_attr = ProductAttributesPredictor()
-    rs = product_attr.predict(docid='', html=html, page_time="")
-    print(rs)
+    # docid = ""
+    # title = ''
+    # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
+    #     html = f.read()
+    # product_attr = ProductAttributesPredictor()
+    # rs = product_attr.predict(docid='', html=html, page_time="")
+    # print(rs)
 
     docid = ""
     title = ''
@@ -6882,7 +6896,8 @@ if __name__=="__main__":
         "广东省广裕集团嘉顺实业有限责任公司",
         "广州顺为招标采购有限公司",
         "中华人民共和国"
-    ])
+    ], web_source_name = '河钢供应链管理平台')
+    print('标段数:',len(rs))
     print(rs)
 
     # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]