Sfoglia il codice sorgente

添加附件多包标识,有表格提取的去掉附件包不在表格标段;无标段多产品同中标人不作为多包;只有县区简称提取的地区不要;优化表头内容连接处理

lsm 1 anno fa
parent
commit
8faaeaea46

+ 7 - 7
BiddingKG/dl/common/Utils.py

@@ -935,21 +935,21 @@ def money_process(money_text, header):
     money_text = re.sub('\s', '', money_text) # 2024/04/19 修复 457699044 556.46751 万元 金额与单位有空格造成万漏提取
     re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?", money_text)
     if re_price:
-        money_text = re_price.group(0)
-        if re.search('万元|[((]万[))]',  header) and '万' not in money_text:  # 修复37797825 控制价(万)
-            money_text += '万元'
+        money_re = re_price.group(0)
+        if (re.search('万元|[((]万[))]',  header) or re.search('万元|[((]万[))]', money_text)) and '万' not in money_re:  # 修复37797825 控制价(万) # 修复 460307391 万元不在表头,在数字前面
+            money_re += '万元'
         # money = float(getUnifyMoney(money_text))
-        money = float(getUnifyMoney(money_text))
+        money = float(getUnifyMoney(money_re))
         if money > 10000000000000:  # 大于万亿的去除
             money = 0
-        money_unit = '万元' if '万' in money_text else '元'
+        money_unit = '万元' if '万' in money_re else '元'
     return (money, money_unit)
 
 package_number_pattern = re.compile(
         '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
 |(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
 |(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
-|((标[段包项]|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
+|((标[段包项]|品目|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
 |[,;。、:(](标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
 |((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,9}[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{0,9})\
 |[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
@@ -971,7 +971,7 @@ def find_package(content):
         content = content.replace(it.group(0), ' ' * len(it.group(0)))
 
     for iter in re.finditer(package_number_pattern, content):
-        if re.search('(业绩|信誉要求):', content[:iter.start()]):  # 前面有业绩或信誉的标段去掉
+        if re.search('(业绩|信誉要求):|业绩(如下)?\d*[、:]', content[:iter.start()]):  # 前面有业绩或信誉的标段去掉
             continue
         # print('提取到标段:%s, 前后文:%s' % (iter.group(), content[iter.start() - 5:iter.end() + 5]))
         if re.match('\d', iter.group(0)) and re.search('\d\.$', content[:iter.start()]):  # 排除2.10标段3  5.4标段划分 这种情况

+ 3 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -963,7 +963,7 @@ def tableToText(soup, docid=None):
     #根据表格处理方向生成句子,        
     def getTableText(inner_table,head_list,key_direct=False):
         # packPattern = "(标包|[标包][号段名])"
-        packPattern = "(标包|标的|标项|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
+        packPattern = "(标包|标的|标项|品目|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
         rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见|评标情况|推荐顺序|选取(情况|说明))"  # 2020/11/23 大网站规则,添加序号为排序
         entityPattern = "((候选|[中投]标|报价)(单位|公司|人|供应商))|供应商名称"
         moneyPattern = "([中投]标|报价)(金额|价)"
@@ -1079,7 +1079,7 @@ def tableToText(soup, docid=None):
 
                                 cell = table_occurence[i][j]
                                 head = (cell["top_head"]+":") if len(cell["top_head"])>0 else ""
-                                if re.search("单报标限总]价|金额|成交报?价|报价", head):
+                                if re.search("单报标限总]价|金额|成交报?价|报价|供应商|候选人|中标人", head):
                                     head = cell["left_head"] + head
                                 else:
                                     head += cell["left_head"]
@@ -2745,7 +2745,7 @@ def del_tabel_achievement(soup):
                 td_text = td.text.strip()
                 if len(td_text) > 25:
                     break
-                if len(td_text) < 25 and re.search('中标候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方', td_text):
+                if len(td_text) < 25 and re.search('中标候选人|第[一二三四五1-5]候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方', td_text):
                     _count += 1
                 if _count >=2:
                     pre_tag = tag.findPreviousSibling().extract()

+ 6 - 6
BiddingKG/dl/interface/extract.py

@@ -277,16 +277,16 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time["attrs"] = round(time.time()-start_time,2)
 
     '''表格要素提取'''
-    table_prem = predictor.getPredictor("tableprem").predict(text, nlp_enterprise, web_source_name)
+    table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise, web_source_name)
     # print('表格提取中标人:', table_prem)
     # print('原提取角色:', prem[0]['prem'])
     if table_prem:
-        getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem)
+        getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem, in_attachment=in_attachment)
 
     '''候选人提取'''
-    candidate_top3_prem, candidate_dic = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise)
+    candidate_top3_prem, candidate_dic, in_attachment = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise)
     # print('表格提取候选人:', candidate_top3_prem)
-    getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem)
+    getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem, in_attachment=in_attachment)
 
     '''获取联合体信息'''
     getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)
@@ -350,11 +350,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # print(project_label)
 
     '''最终验证prem'''
-    getAttributes.confirm_prem(prem[0]['prem'])
+    getAttributes.confirm_prem(prem[0]['prem'], channel_dic)
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-04-26'}
+    version_date = {'version_date': '2024-04-29'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
 
     '''最终检查修正招标、中标金额'''

+ 37 - 13
BiddingKG/dl/interface/getAttributes.py

@@ -452,7 +452,8 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
     pack = getPackagesFromArticle(list_sentence,list_entity)
     if pack is None:
         return None
-    PackageList,PackageSet,dict_PackageCode = pack
+    # PackageList,PackageSet,dict_PackageCode = pack
+    PackageList,PackageSet,dict_PackageCode,main_body_pack = pack
 
     #拿到所有可能的情况
     dict_role_combination = {}
@@ -568,7 +569,7 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
                     _find_flag = True
             if not _find_flag:
                 _entity.pointer_pack = None
-    return RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set
+    return RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set,main_body_pack
 
 def getPackageScopePattern():
     '''
@@ -600,6 +601,7 @@ def getPackagesFromArticle(list_sentence, list_entity):
     PackageList_scope = []
     PackageSet = set()
     dict_packageCode = dict()
+    main_body_pack = set()  # 2024/04/28 保存正文包号
 
     # package_number_pattern =  re.compile(
     # '((施工|监理|监测|勘察|设计|劳务)(标段)?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{,4}(标段?|包))|(([a-zA-Z]包[:)]?)?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{1,4}标[段包]?)|((标[段号的包项]|([标分子]|合同|项目|采购|()包|包[组件号])[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦA-Za-z]{1,4})|(([,;。、:(]|第)[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}分?包)|([a-zA-Z][0-9]{,3}分?[包标])|.{,1}((包组|包件|包号|分?包|标[段号的包]|子项目)编?号?[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)|[,;。、:(]包[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\w]')  # 标号
@@ -724,6 +726,8 @@ def getPackagesFromArticle(list_sentence, list_entity):
                 if code is not None:
                     dict_packageCode[temp_package_number] = code
                 PackageSet.add(temp_package_number)
+                if not list_sentence[i].in_attachment: # 保存不在附件的包号
+                    main_body_pack.add(temp_package_number)
 
             # 识别packageScope
             for iter in re.finditer(pattern_packageScope, content):
@@ -772,6 +776,9 @@ def getPackagesFromArticle(list_sentence, list_entity):
                     if code is not None:
                         dict_packageCode[temp_package_number] = code
                     PackageSet.add(temp_package_number)
+                    if not list_sentence[i].in_attachment:  # 保存不在附件的包号
+                        main_body_pack.add(temp_package_number)
+
                 # 识别packageScope
                 for iter in re.finditer(pattern_packageScope, content):
                     PackageList_item_scope.append({"name": "", "sentence_index": list_sentence[i].sentence_index,
@@ -860,7 +867,7 @@ def getPackagesFromArticle(list_sentence, list_entity):
     PackageList = get_package_scope(PackageList_scope)
     # if len(PackageSet)<2: # 20230922只提取到一个包号的去掉,都放在默认包project 2024/02/02 注释掉,防止多标段每篇公告只公布一个标段的没法提取标段号
         # return [], set(), {}
-    return PackageList, PackageSet, dict_packageCode
+    return PackageList, PackageSet, dict_packageCode, main_body_pack
 
 
 # km配对方法
@@ -2891,7 +2898,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
     return PackDict
 
-def initPackageAttr(RoleList,PackageSet,win_tenderer_set,tenderee_or_agency_set):
+def initPackageAttr(RoleList,PackageSet,win_tenderer_set,tenderee_or_agency_set, main_body_pack):
     '''
     @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
     '''   
@@ -2899,6 +2906,7 @@ def initPackageAttr(RoleList,PackageSet,win_tenderer_set,tenderee_or_agency_set)
     packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''}
     for item in list(PackageSet):
         packDict[item] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''}
+        packDict[item]['in_attachment'] = False if item in main_body_pack else True
     for item in RoleList:
         if packDict[item.packageName]["code"] =="":
             packDict[item.packageName]["code"] = item.packageCode
@@ -2919,13 +2927,13 @@ def getPackageRoleMoney(list_sentence,list_entity,list_outline):
     if not theRole:
         return []
     # RoleList,RoleSet,PackageList,PackageSet = theRole
-    RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set = theRole
+    RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set,main_body_pack = theRole
     '''
     for item in PackageList:
         # print(item)
     '''
     # PackDict = initPackageAttr(RoleList, PackageSet)
-    PackDict = initPackageAttr(RoleList, PackageSet, win_tenderer_set,tenderee_or_agency_set)
+    PackDict = initPackageAttr(RoleList, PackageSet, win_tenderer_set,tenderee_or_agency_set,main_body_pack)
 
     PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline)
     return PackDict
@@ -3965,7 +3973,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                     moneys.append(money)
     return {'moneys': list(set(moneys)), 'moneys_attachment': list(set(moneys_attachment))}
 
-def update_prem(old_prem, new_prem):
+def update_prem(old_prem, new_prem, in_attachment=False):
     '''
     根据新旧对比,更新数据
     :param old_prem:
@@ -3974,7 +3982,7 @@ def update_prem(old_prem, new_prem):
     '''
     if len(new_prem) >= 1 :
         '''如果表格提取的包大于2,原来的包比表格提取的包多则删除原来多余的包,以表格的为准'''
-        if len(new_prem) >= 2 and len(old_prem) <= len(new_prem)*2:
+        if len(new_prem) >= 2 and len(new_prem)<len(old_prem) <= len(new_prem)*2:
             del_k = []
             for k in old_prem:
                 if k not in new_prem and k != 'Project':
@@ -3982,9 +3990,25 @@ def update_prem(old_prem, new_prem):
             for k in del_k:
                 old_prem.pop(k)
 
-        if len(old_prem) == 2 and len(new_prem) == 1 and 'Project' in new_prem and 'Project' in old_prem:
-            k = list(old_prem.keys()-new_prem.keys())[0]
-            new_prem[k] = new_prem.pop('Project')
+        if len(old_prem) > len(new_prem) and in_attachment==False: # 如果表格有提取,非表格包数比表格提取多,去掉非表格在附件里提取的包
+            del_k = []
+            for k in old_prem:
+                if 'in_attachment' in old_prem[k] and old_prem[k]['in_attachment'] and k not in new_prem and k != 'Project':
+                    del_k.append(k)
+            for k in del_k:
+                old_prem.pop(k)
+
+        # if len(new_prem) > len(old_prem) and [k for k in new_prem if '自增' not in k] == []:  # 如果表格提取包号都为自增编号且包数大于非表格提取,不进行更新 例 244355092  281854766
+        #     return None
+
+        if len(old_prem) == 2 and len(new_prem) == 1 and ('Project' in new_prem or set(new_prem)&set(old_prem)==set()): # 如果表格提取包为Project,非表格提取两个包且一个包为Project,把表格提取合并到非Project包
+            k = list(old_prem.keys()-set(['Project']))[0]
+            k_new = list(new_prem.keys())[0]
+            new_prem[k] = new_prem.pop(k_new)
+
+        if len(new_prem) == len(old_prem) == 1 and 'Project' not in new_prem and 'Project' in old_prem: # 如果表格提取到包号,非表格没提取到,合并到Project
+            k = list(new_prem.keys())[0]
+            new_prem['Project'] = new_prem[k]
 
         multi_tendereeMoney = [] # 多包招标金额
         for k, v in new_prem.items():
@@ -4039,7 +4063,7 @@ def update_prem(old_prem, new_prem):
 
     # return old_prem
 
-def  confirm_prem(prem):
+def  confirm_prem(prem, channel_dic):
     '''
     规则检查纠正prem,如果Project包中标人在其他包中标人,去掉project包中标角色;如果有其他包中标人,去掉roleList为空的包;
     :param prem: prem 字段字典
@@ -4062,7 +4086,7 @@ def  confirm_prem(prem):
             prem['Project']['roleList'] = [d for d in prem['Project']['roleList'] if
                                                d['role_name'] not in ['win_tenderer', 'second_tenderer',
                                                                       'third_tenderer']]
-        if other_winner:
+        if other_winner and channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告']:
             for k in empty_roleList:
                 prem.pop(k)
 

+ 1 - 1
BiddingKG/dl/interface/modelFactory.py

@@ -89,7 +89,7 @@ class Model_role_classify_word():
         '''
         text = re.sub('第[一二三1-3]([条项章]|中学|医院|附属)|第三方(服务机构)?', 'xxx', text)
         text = re.sub('第01(中标|成交)?候选人', '第一中标候选人', text)
-        text = re.sub('标段[一二三1-3]', '标段d', text)
+        text = re.sub('([的包项]|品目)[一二三1-3]', '标段', text)
         text = re.sub('第?[一二三1-3](标段?|[分子标]?包)', 'd标段', text)
         text = re.sub('[a-zA-Z][a-zA-Z0-9=&_—-]{3,}', 'abc', text)
         text = re.sub('[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd', text)

+ 57 - 13
BiddingKG/dl/interface/predictor.py

@@ -5686,6 +5686,8 @@ class DistrictPredictor():
             province_l = find_areas(p_pro, text)
             city_l = find_areas(p_city, text)
             district_l = find_areas(p_dis, text)
+            if len(province_l) == len(city_l) == 0:
+                district_l = [it for it in district_l if re.search('[市县旗区]$', it[0])]  # 20240428去掉只有区县地址且不是全称的匹配,避免错误 例 凌云工业股份有限公司 提取地区为广西百色凌云
 
             province_l = chage_area2score(province_l, max_len=len(text))
             city_l = chage_area2score(city_l, max_len=len(text))
@@ -5913,10 +5915,11 @@ class DistrictPredictor():
 
         project_name = project_name + title if project_name not in title else title
         # project_name = project_name.replace(tenderee, '')
-        entity_list = getNers([project_name],useselffool=False) # 2024/4/26 修改为去重项目名称中所有公司名称
-        for tup in entity_list[0]:
-            if tup[2] in ['org', 'company']:
-                project_name = project_name.replace(tup[3], '')
+        if len(project_name)>3:
+            entity_list = getNers([project_name],useselffool=False) # 2024/4/26 修改为去重项目名称中所有公司名称
+            for tup in entity_list[0]:
+                if tup[2] in ['org', 'company']:
+                    project_name = project_name.replace(tup[3], '')
 
         text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)
 
@@ -6066,7 +6069,7 @@ class TablePremExtractor(object):
         self.head_rule_dic = {
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
-            "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
+            "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
             "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
@@ -6182,6 +6185,7 @@ class TablePremExtractor(object):
         multi_same_package = False # 非连续的重复包号
         package_fix2raw = dict()  # 处理后包号:处理前包号 字典
         link_set = set()
+        tenderer_list = [] # 保存所有中标人
         not_package = True if 'project_name' in headers and re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \
                           'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
 
@@ -6189,6 +6193,7 @@ class TablePremExtractor(object):
             or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683;  补充避免423647863采购意向被过滤
             # print('没有包号及角色的不要')
             return {}
+
         for i in df.index:
             same_package = False  # 连续重复包号,一般是 rowspan 造成;一包 多个采购
             project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
@@ -6260,9 +6265,9 @@ class TablePremExtractor(object):
 
             if project_code != "":
                 uni_project_code= uniform_package_name(project_code)
-                if uni_project_code != "" and package != "":
+                if uni_project_code != "" and package != "" and uni_project_code!=package:
                     # print('重组包号:', '%s_%s'%(uni_project_code, package))
-                    package = '%s_%s'%(uni_project_code, package)
+                    package = '%s_%s'%(uni_project_code, package.replace('自增', ''))
             if package_code_raw!='':
                 if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
                     package_fix2raw[package] = package_code_raw
@@ -6341,6 +6346,7 @@ class TablePremExtractor(object):
                         "role_text": tenderer,
                         "serviceTime": ""
                 })
+                tenderer_list.append(tenderer)
             if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃 并不再继续往下匹配
                 prem_dic.pop(package)
                 # break # 注释掉避免 400084571 某些包废标 中断匹配
@@ -6348,6 +6354,32 @@ class TablePremExtractor(object):
                 for k, v in package_fix2raw.items():
                     if k in prem_dic:
                         prem_dic[v] = prem_dic.pop(k)
+        if len(tenderer_list)>2 and len(set(tenderer_list))==1 and "package_code" not in headers: # 没提取到包号且中标人一样应该是错误多包,需去掉多包 例 244355092  281854766
+            total_money = 0
+            for v in prem_dic.values():
+                for d in v['roleList']:
+                    if d['role_name'] == "win_tenderer":
+                        total_money += d['role_money']['money']
+            return {'自增1': {
+                'code': '',
+                'name': '',
+                'roleList': [{
+                        "address": "",
+                        "linklist": [],
+                        "role_money": {
+                            "discount_ratio": "",
+                            "downward_floating_ratio": "",
+                            "floating_ratio": "",
+                            "money": total_money,
+                            "money_unit": ''
+                        },
+                        "role_name": "win_tenderer",
+                        "role_text": tenderer_list[0],
+                        "serviceTime": ""
+                }],
+                'tendereeMoney': 0,
+                'tendereeMoneyUnit': ""
+            }}
         return prem_dic
 
     def update_prem(self, rs_dic, tmp_dic):
@@ -6417,11 +6449,11 @@ class TablePremExtractor(object):
                         self.update_prem(table_prem, prem_)
                     i = j - 1
                 i += 1
-            if table_prem and len(trs) == 2 and 'package_code' not in headers and '1' in table_prem and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
+            if table_prem and len(trs) == 2 and 'package_code' not in headers and '自增1' in table_prem and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
                 sib = table.find_previous_sibling()
                 sib_text = sib.get_text()
                 ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}', sib_text)
-                if sib.name in ['p', 'div'] and len(sib_text)<30 and ser_sib:
+                if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
                     package_sib = ser_sib.group(0)
                     package_sib = uniform_package_name(package_sib)
                     table_prem[package_sib] = table_prem.pop('自增1')
@@ -6437,16 +6469,18 @@ class TablePremExtractor(object):
         soup = BeautifulSoup(html, 'lxml')
         richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
         self.nlp_enterprise = nlp_enterprise
+        in_attachment = False
         if richText:
             richText = richText.extract()  # 过滤掉附件
         prem = self.get_prem(soup, web_source_name)
         if prem == {} and richText:
             prem = self.get_prem(richText, web_source_name)
+            in_attachment = True
         if len(prem) == 1:  # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
             k = list(prem)[0]
-            if k == '1' or len(k) > 2:
+            if k.startswith('自增'):
                 prem['Project'] = prem.pop(k)
-        return prem
+        return prem, in_attachment
 
 class CandidateExtractor(object):
     def __init__(self):
@@ -6719,7 +6753,6 @@ class CandidateExtractor(object):
         candidate_set = set()
         for table in tables:
             trs = self.tb.table2list(table)
-            table.extract()
             i = 0
             headers = ""
             while i < len(trs) - 1:
@@ -6745,6 +6778,15 @@ class CandidateExtractor(object):
                         candidate_set.update(candidate_set_)
                     i = j - 1
                 i += 1
+            if rs_dic and 'package_code' not in headers and 'Project' in rs_dic and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
+                sib = table.find_previous_sibling()
+                sib_text = sib.get_text()
+                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}', sib_text)
+                if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
+                    package_sib = ser_sib.group(0)
+                    package_sib = uniform_package_name(package_sib)
+                    rs_dic[package_sib] = rs_dic.pop('Project')
+            table.extract()
         return rs_dic, candidate_set
 
     def get_candidates_from_text(self, list_sentences, list_entitys):
@@ -6772,14 +6814,16 @@ class CandidateExtractor(object):
         html = re.sub("##attachment##","",html)
         soup = BeautifulSoup(html, 'lxml')
         richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
+        in_attachment = False
         if richText:
             richText = richText.extract()  # 过滤掉附件
         prem, candidate_set = self.get_prem(soup)
         if prem == {} and richText:
             prem, candidate_set = self.get_prem(richText)
+            in_attachment = True
         if prem == {} and candidate_set == set():
             candidate_set = self.get_candidates_from_text(list_sentences, list_entitys)
-        return prem, {'candidate': ','.join(candidate_set)}
+        return prem, {'candidate': ','.join(candidate_set)}, in_attachment
 
 def role_special_predictor(web_source_name, content, nlp_enterprise):
     if web_source_name == '中国电子科技集团有限公司电子采购平台':