Преглед изворни кода

修复反馈问题;新增审批公告表格多项目提取;

lsm пре 3 месеци
родитељ
комит
339570f104

+ 10 - 10
BiddingKG/dl/common/Utils.py

@@ -984,7 +984,7 @@ package_number_pattern = re.compile(
 |(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
 |(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
 |((标[段包项]|品目|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
-|([,;。、:(]|^)(标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
+|([,;。、:(]|^)(标的?|(招标|采购)?项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
 |((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,9}[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{0,9})\
 |[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
 filter_package_pattern =  'CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
@@ -1171,15 +1171,15 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
 
     # sentence_text = re.sub('\d+[年月日]', '', sentence_text) # 修复560180018 中标价(元):3年投标报价(元)含税6299700.00 3年作为金额
 
-    if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
-        found_yeji += 1
-    if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
-        all_match = []
-    else:
-        ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text)  # 过滤掉收费标准里面的金额
-        if ser:
-            sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
-        all_match = re.finditer(pattern_money, sentence_text)
+    # if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
+    #     found_yeji += 1
+    # if found_yeji >= 2:  # 过滤掉业绩后面的所有金额 # 20250210修复逻辑错误,中标金额被前面句子业绩表达过滤 评分因素:业绩(9分),评分标准:提供2021年1月1日以来类似项目业绩, 589003579
+    #     all_match = []
+    # else:
+    ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text)  # 过滤掉收费标准里面的金额
+    if ser:
+        sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
+    all_match = re.finditer(pattern_money, sentence_text)
     # print('all_match:', all_match)
     for _match in all_match:
         # print('_match: ', _match.group())

+ 4 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -3364,6 +3364,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = article_processed.replace('(', '(').replace(')', ')')  #2022/8/10 统一为中文括号
         article_processed = article_processed.replace('侯选人', '候选人')  #2024/09/03 修复错别字避免预测错误。
         article_processed = article_processed.replace('人选人', '入选人')  #2024/09/03 修复错别字避免预测错误。
+        article_processed = article_processed.replace('⺠', '民')  # 2025/02/17 修复错别字 例:连云港市第一人⺠医院。
         # article_processed = article_processed.replace(':', ':')  #2023/1/5 统一为中文冒号
         article_processed = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])", ":", article_processed)
         article_processed = article_processed.replace('.','.').replace('-', '-') # 2021/12/01 修正OCR识别PDF小数点错误问题
@@ -3416,6 +3417,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('四舍五入至', '', article_processed) # 修复 533537050 ,中标价(四舍五入至万元):6468万元
         if re.search('推荐供应商:', article_processed) and re.search('入围供应商:', article_processed): # 修复 中国工商银行 类网站 入围的才算中标
             article_processed = article_processed.replace('推荐供应商:', '公司名称:')
+        if web_source_no.startswith('DX016489') and re.search('排名', article_processed) and re.search('成交供应商单位名称', article_processed): # 20250219 处理特殊站源有排名却叫成交供应商
+            article_processed = article_processed.replace('成交供应商单位名称', '成交候选人单位名称')
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -3653,7 +3656,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
     '''
 
     list_entitys = []
-    not_extract_roles = ['黄埔军校', '国有资产管理处', '五金建材', '铝合金门窗', '华电XX发电有限公司', '华电XXX发电有限公司', '中标(成交)公司', '贵州茅台', '贵州茅台酒'] # 需要过滤掉的企业单位
+    not_extract_roles = ['黄埔军校', '国有资产管理处', '五金建材', '铝合金门窗', '华电XX发电有限公司', '华电XXX发电有限公司', '中标(成交)公司', '贵州茅台', '贵州茅台酒', '陕西省省级国'] # 需要过滤掉的企业单位
     for list_sentence in list_sentences:
         sentences = []
         list_entitys_temp = []

+ 4 - 3
BiddingKG/dl/interface/extract.py

@@ -485,14 +485,15 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2025-01-17'}
+    version_date = {'version_date': '2025-02-19'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
-        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text)
+        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text, nlp_enterprise=nlp_enterprise+nlp_enterprise_attachment)
         approval = predictor.getPredictor("approval").add_ree2approval(approval , prem[0]['prem'])
+        approval = predictor.getPredictor("approval").add_codename2approval(approval , codeName)
         data_res['prem'] = {}  # 审批项目不要这项
-        data_res['approval'] = approval
+        data_res['approval'] = approval[:100] # 20250217 限制获取最多100个项目
 
     if channel_dic['docchannel']['doctype'] == '处罚公告': # 20240627 处罚公告进行失信要素提取
         start_time = time.time() #失信数据要素提取

+ 1 - 1
BiddingKG/dl/interface/kvtree_search.py

@@ -11,7 +11,7 @@ import re
 requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
                           "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
                       "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)"
-aptitude_pattern = "资质(资格)要求|资格(资质)要求|单位要求|资质及业绩要求|((资格|资质|准入)[的及]?(要求|条件|标准|限定|门槛)|竞买资格及要求|供应商报价须知)|按以下要求参与竞买|((报名|应征|竞买|投标|竞投|受让|报价|竞价|竞包|竞租|承租|申请|参与|参选|遴选)的?(人|方|单位|企业|客户|机构)?|供应商|受让方)((必?须|需|应[该当]?)(具备|满足|符合|提供)+以?下?)?的?(一般|基本|主要)?(条件|要求|资格(能力)?|资质)+|乙方应当符合下列要求|参与比选条件|合格的投标人|询价要求"
+aptitude_pattern = "资质(资格)要求|资格(资质)要求|单位要求|资质及业绩要求|((资格|资质|准入)[的及]?(要求|条件|标准|限定|门槛)|竞买资格及要求|供应商报价须知)|按以下要求参与竞买|((报名|应征|竞买|投标|竞投|受让|报价|竞价|竞包|竞租|承租|申请|参与|参选|遴选)的?(人|方|单位|企业|客户|机构)?|供应商|受让方)((必?须|需|应[该当]?)(具备|满足|符合|提供)+以?下?)?的?(一般|基本|主要)?(条件|要求|资格(能力)?|资质)+|乙方应当符合下列要求|参与比选条件|合格的投标人|询价要求|项目要求"
 
 pinmu_name_pattern = "采购品目(名称)?([::,]|$)"
 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)"

+ 247 - 27
BiddingKG/dl/interface/predictor.py

@@ -1057,10 +1057,10 @@ class PREMPredict():
                     if _entity.entity_type == "money" and _entity.notes == '招标或中标金额' and _entity.label == 2:
                         # if channel_dic['docchannel'] == "招标公告":
                         if re.search('中标|成交|中选|中价|中租|结果|入围', title + list_articles[0].content[:100]) == None:
-                            _entity.values[0] = 0.51
+                            _entity.values[0] = 0.55
                             _entity.set_Money(0, _entity.values)  # 2021/11/18 根据公告类别把费用改为招标或中投标金额
                         else:
-                            _entity.values[1] = 0.51
+                            _entity.values[1] = 0.55
                             _entity.set_Money(1, _entity.values)
 
     def predict(self,list_sentences,list_entitys):
@@ -1486,7 +1486,7 @@ class RoleRulePredictor():
 
         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商|银行)))|" \
                                          "^((报价|价格)最低,|以\w{5,10})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
-                                         "|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格|^以\d+[\d,.]+万?元(中标|成交|中选)" \
+                                         "|^:贵公司参与|^:?你方于|^(胜出)?(中标|成交)[,]|^取得中标(单位)?资格|^以\d+[\d,.]+万?元(中标|成交|中选)" \
                                          "|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))" # 去掉 |\w{,20} 修复 460216955 网上公布的与本次采购项目有关的信息视为已送达各响应供应商。 作为中标
         self.pattern_winTenderer_whole = "(?P<winTenderer_center>(贵公司|由).{,15}以\w{,15}中标|确定[\w()]{5,20}为[^,。;]{5,50}的?中标单位" \
                                          "|选定报价最低的[“”\w()]{5,25}为[^,。;]{5,50}的?(服务|中标|成交)单位" \
@@ -3542,6 +3542,8 @@ class ProductAttributesPredictor():
                                 i += 1
                                 # print('过滤:产品单价包含金额外的字符数大于5个',  tds[id3])
                                 continue
+                            else:
+                                unitPrice = tds[id3]
                         if id4 != "":
                             if re.search('\w', tds[id4]):
                                 brand = tds[id4]
@@ -5867,7 +5869,7 @@ class DistrictPredictor():
         text = str(text).replace('(', '(').replace(')', ')')
         text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
         text = re.sub(
-            '复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城|西九龙站|广州路北|安阳山村', # 570445994 广州路北侧 预测为 广州 路北
+            '复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城|西九龙站|广州路北|安阳山村|电信|联通|北京现代', # 570445994 广州路北侧 预测为 广州 路北
             ' ', text)  # 544151395 赤壁市老城区燃气管道老化更新改造
         text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
         text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域:怒江州 识别为广西 - 崇左 - 江州
@@ -5900,18 +5902,18 @@ class DistrictPredictor():
                                 score = 2
                             else:
                                 score = 1
-                                if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
+                                if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站|地区|区域)'
                                         , text[it.end(k):]) or re.search('^((%s)|\-%s)' % (v, v),
                                                                          text[max(0, it.start(k) - 1):]):
                                     score += 1
-                            # score += it.end(k) / len(text) / 10
+                            score += it.end(k) / len(text) / 10
                             province_l.append((v, score * weight))
                         elif k in ['city', 'city1']:
                             if v in full_dic['city']:
                                 score = 2
                             else:
                                 score = 1
-                                if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
+                                if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站|地区|区域)'
                                         , text[it.end(k):]) or re.search('^((%s)|\-%s)' % (v, v),
                                                                          text[max(0, it.start(k) - 1):]):
                                     score += 1
@@ -5924,7 +5926,7 @@ class DistrictPredictor():
                                 score = 2
                             else:
                                 score = 0.5
-                                if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
+                                if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站|地区|区域)'
                                         , text[it.end(k):]) or (
                                         re.match('\s*%s' % v, text) and it.start(k) < 2) or re.search(
                                     '^((%s)|\-%s)' % (v, v), text[max(0, it.start(k) - 1):]):
@@ -6131,12 +6133,14 @@ class DistrictPredictor():
         addr_bidsend = addr_dic.get('addr_bidsend', '')
         addr_contact = addr_dic.get('addr_contact', '')
         in_content = False
+        not_sure = True # 是否不确定地区
         province_l, city_l, district_l = self.find_whole_areas('%s %s'%(title, addr_project), self.pettern, self.area_variance_dic, self.full_dic)
         pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
-        big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
+        big_area_1, pred_pro_1, pred_city_1, pred_dis_1, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
+        big_area, pred_pro, pred_city, pred_dis = big_area_1, pred_pro_1, pred_city_1, pred_dis_1
         # print('关键词1:', province_l, city_l, district_l)
         # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
-        if pred_city == "" or prob < 0.7 or max_score<2:
+        if pred_city_1 == "" or prob < 0.7 or max_score<2:
             ree, addr = self.get_ree_addr(prem)
             if ree in title:
                 ree = '##'
@@ -6151,27 +6155,33 @@ class DistrictPredictor():
             city_l.extend(city_l2)
             district_l.extend(district_l2)
             pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
-            big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
+            big_area_2, pred_pro_2, pred_city_2, pred_dis_2, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
+            big_area, pred_pro, pred_city, pred_dis = big_area_2, pred_pro_2, pred_city_2, pred_dis_2
             # print('关键词2:', province_l, city_l, district_l)
             # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
-            if pred_city == "" or prob < 0.7 or max_score<2:
+            if re.search('省|市|自治', addr_project) and pred_pro_1 != '' and pred_pro_1 != pred_pro_2: # 如果有项目地址使用项目地址
+                not_sure = False
+                big_area, pred_pro, pred_city, pred_dis = big_area_1, pred_pro_1, pred_city_1, pred_dis_1
+            if not_sure and (pred_city_2 == "" or prob < 0.7 or max_score<2):
                 province_l3, city_l3, district_l3 = self.find_whole_areas('%s %s'%(addr_bidopen, addr_bidsend), self.pettern, self.area_variance_dic, self.full_dic, weight=0.6)
                 province_l.extend(province_l3)
                 city_l.extend(city_l3)
                 district_l.extend(district_l3)
                 pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
-                big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
+                big_area_3, pred_pro_3, pred_city_3, pred_dis_3, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
+                big_area, pred_pro, pred_city, pred_dis = big_area_3, pred_pro_3, pred_city_3, pred_dis_3
                 # print('关键词3:', province_l, city_l, district_l)
                 # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
-                if pred_city == "" or prob < 0.6 or max_score < 2:
+                if not_sure and (pred_city_3 == "" or prob < 0.6 or max_score < 2):
                     all_addr, tenderees = self.get_all_addr(list_entity)
                     province_l4, city_l4, district_l4 = self.find_whole_areas('%s %s %s' % (web_source_name, tenderees, all_addr), self.pettern, self.area_variance_dic, self.full_dic, weight=0.3)
                     province_l.extend(province_l4)
                     city_l.extend(city_l4)
                     district_l.extend(district_l4)
                     pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
-                    big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids,dis_ids, self.idx_dic)
-                    if prob < 0.6 or max_score < 4:
+                    big_area_4, pred_pro_4, pred_city_4, pred_dis_4, prob, max_score = self.get_final_addr(pro_ids, city_ids,dis_ids, self.idx_dic)
+                    big_area, pred_pro, pred_city, pred_dis = big_area_4, pred_pro_4, pred_city_4, pred_dis_4
+                    if pred_pro_3 != pred_pro_4 and (prob < 0.6 or max_score < 2):
                         in_content = True
                     # print('关键词4:', province_l, city_l, district_l)
                     # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
@@ -6819,7 +6829,7 @@ class TablePremExtractor(object):
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
             "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
             "win_sort": "排名|排序|名次|推荐顺序",
-            'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因|中标情况',
+            'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因|中标情况|^中标结果$',
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
@@ -6920,6 +6930,11 @@ class TablePremExtractor(object):
                     header_dic['tenderer'] = other_tenderer2
                     if 'win_sort' not in header_dic:
                         not_sure_winner = True
+            elif 'tenderer' not in header_dic and 'win_or_not' in header_dic:
+                if other_tenderer!="":
+                    header_dic['tenderer'] = other_tenderer
+                elif other_tenderer2!="":
+                    header_dic['tenderer'] = other_tenderer2
             if all_winner == 1 and 'win_sort' in header_dic: # 标题有存管类公告不分排名
                 header_dic.pop('win_sort')
             if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
@@ -7338,7 +7353,7 @@ class CandidateExtractor(object):
             "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
             "win_sort": "排名|排序|名次|推荐顺序",
             'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论|^选择设备$', # 补充站源特别表达:例:577351909 选择设备 1 为中标 0 非中标
-            "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位|^公司名称$", #补充 368295593 投标个人/单位 提取
+            "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位|^公司名称$|供应商单位名称$", #补充 368295593 投标个人/单位 提取
             "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
             "win_tenderer": "第一名|第一(中标|成交)?候选人",
             "second_tenderer": "第二名|第二(中标|成交)?候选人",
@@ -7374,6 +7389,8 @@ class CandidateExtractor(object):
                     if re.search(v, text):
                         if k in ['candidate', 'win_tenderer', 'second_tenderer', 'third_tenderer']  and re.search('是否', text):
                             continue
+                        elif k == 'win_or_not' and re.search('是否(中标|成交)候选人', text): # 修复 584112560 把第二作第一错误
+                            continue
                         header_dic[k] = (i, text)
                         # if k != 'candidate': # candidate 可与前三候选重复
                         num += 1
@@ -7410,7 +7427,7 @@ class CandidateExtractor(object):
         :param nlp_enterprise: 公告中的角色实体列表
         :return:
         '''
-        text = re.sub('主报名人:|联合报名人:|联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
+        text = re.sub('主报名人:|联合报名人:|联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]|(联合体(牵头|成员)单位)'
                       , ',', text)
         text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
         text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '',
@@ -7827,11 +7844,11 @@ def get_header_line(list_item):
         x.append(getPredictor("form").encode(item))
     predict_y = getPredictor("form").predict(np.array(x), type="item")
     for item, values in zip(list_item, list(predict_y)):
-        item = str(item)
+        item = str(item).replace(' ', '')
         lb = 1 if values[1] > 0.5 else 0
-        if item in ['许可/同意', '办结(通过)', '办结(准予许可)','批准', '合格']:
+        if item in ['许可/同意', '办结(通过)', '办结(准予许可)','批准', '合格', '民间投资', '备案']:
             lb = 0
-        elif item in ['环境影响评价机构', '建设单位或地方政府作出的相关环保承诺'] or re.search('^比例\d{1,2}%$', item):
+        elif item in ['环境影响评价机构', '建设单位或地方政府作出的相关环保承诺', '环境影响评价技术服务机构', '报告全本'] or re.search('^比例\d{1,2}%$', item):
             lb = 1
         elif lb == 0 and item in header_set:
             lb = 1
@@ -7892,6 +7909,10 @@ class ApprovalPredictor():
             "total_tendereeMoney": "(项目|概算|投资)金额|项目投资|总投资|总预算|总概算|投资(规模|总额|估算|概算)|批复概算|投资额|项目概算", # 总投资
         }
 
+        self.head_rule_dic = {**self.role_type, **self.person_type, **self.date_type, **self.addr_type, **self.money_type}
+        self.head_rule_dic.update({k: v.split(':')[0] for k,v in self.other_part.items()})
+        self.tb = TableTag2List()
+
     def recursive_text(self, tag):
         '''
         递归获取 soup 节点文本
@@ -7914,8 +7935,190 @@ class ApprovalPredictor():
                     texts.append(re.sub('\s', '', child.strip().replace(':', ':').replace('(', '(').replace(')', ')')))
         return texts
 
-    def predict(self, list_sentences, list_entitys, html, span=12):
-        soup = BeautifulSoup(html)
+    def get_table_info(self, df, nlp_enterprise):
+        '''
+        Extract one record per data row from a row-oriented approval table.
+
+        Every row of ``df`` is classified cell by cell with ``get_header_line``; header rows and
+        header columns are located from those flags, header cells are mapped to field names via
+        ``self.head_rule_dic`` and each data row below a header row is turned into a dict.
+        Only tables that have header rows and no header columns are extracted.
+
+        :param df: pandas DataFrame holding the table cell texts, one table row per DataFrame row
+        :param nlp_enterprise: passed through to ``get_role`` when a role column is read
+        :return: list of dicts keyed by ``head_rule_dic`` field names; each dict contains
+                 ``project_name`` and at least one more non-empty field
+        '''
+        def get_header_index(datas):
+            '''
+            Locate header rows and header columns from the per-cell 0/1 header flags.
+            :param datas: header flags of the table cells, e.g. [[1,1,1,1],[0,0,0,0]]
+            :return: (header_row, header_col), each a list of (index, share of header cells)
+            '''
+            def is_header(flags):
+                # Header when more than 80% of the cells are flagged, or, for more than
+                # 3 cells, when "11" runs outnumber "10" runs in the flag sequence.
+                if sum(flags) / len(flags) > 0.8:
+                    return True
+                joined = ''.join([str(it) for it in flags])
+                return len(flags) > 3 and len(re.findall('11', joined)) > len(re.findall('10', joined))
+
+            header_row = []
+            header_col = []
+            df_h = pd.DataFrame(datas)  # header flags laid out like the table
+            for i in df_h.index:
+                line = df_h.loc[i].values
+                if is_header(line):
+                    header_row.append((i, sum(line) / len(line)))
+            for i in df_h.columns:
+                col = df_h[i].values
+                # Judge the column by its own flags; the run-length test used to read the
+                # flags of the last row left over from the loop above.
+                if is_header(col):
+                    header_col.append((i, sum(col) / len(col)))
+            return header_row, header_col
+
+        def get_header(l, head_rule_dic):
+            '''
+            Map field names to the index of the first cell in ``l`` matching their pattern.
+            :param l: cell texts of one header row
+            :param head_rule_dic: field name -> regular expression
+            :return: dict of field name -> cell index
+            '''
+            header_dic = {}
+            for i in range(len(l)):
+                text = l[i].replace(' ', '')  # fix for doc 54969575: header text split by spaces
+                for k, v in head_rule_dic.items():
+                    if re.search(v, text):
+                        header_dic.setdefault(k, i)  # the first matching cell wins
+            return header_dic
+
+        result_l = []
+        datas = [get_header_line(df.loc[i].values) for i in df.index]
+        header_row, header_col = get_header_index(datas)
+        if len(header_col) == 1 and header_col[0][0] > 1:  # a lone header column at index > 1 is discarded
+            header_col = []
+        if not header_row or header_col:
+            # Column-oriented tables and tables with both header rows and header columns are
+            # not extracted: that path proved error-prone (doc 53489774 was split into
+            # several lots), so the formerly unreachable column branch has been dropped.
+            return result_l
+
+        i = 0
+        while i < len(header_row):
+            idx, _ = header_row[i]
+            if idx + 1 >= len(df):
+                break
+            header_dic = get_header(df.loc[idx].values, self.head_rule_dic)
+            i += 1
+            range_from = idx + 1
+            range_to = len(df)
+            if i < len(header_row):
+                next_header = i
+                for j in range(i, len(header_row)):
+                    idx2, _ = header_row[j]
+                    if idx2 - idx == 1:
+                        # Directly adjacent header row: merge it when both rows share a cell
+                        # value, otherwise let it replace the previous header.
+                        header_dic2 = get_header(df.loc[idx2].values, self.head_rule_dic)
+                        if set(df.loc[idx].values) & set(df.loc[idx2].values) != set():
+                            header_dic.update(header_dic2)
+                        else:
+                            header_dic = header_dic2
+                        range_from = idx2 + 1
+                        range_to = len(df)
+                        next_header = j + 1
+                        idx = idx2
+                    else:
+                        # Next header row is further down: the data rows end right before it.
+                        range_from = idx + 1
+                        range_to = idx2
+                        next_header = j
+                        break
+                i = next_header
+            if len(header_dic) >= 2 and 'project_name' in header_dic:
+                for index in range(range_from, range_to):
+                    if len(set(df.loc[index, :])) <= 2:  # fix for doc 56873031: rows with at most two distinct values are skipped
+                        continue
+                    tmp_dic = {}
+                    for k, v in header_dic.items():
+                        cell = df.loc[index, v]
+                        if k.startswith('time_'):
+                            content = timeFormat(cell, default_first_day=False) if k in [
+                                'time_completion'] else timeFormat(cell)
+                        elif k in self.role_type:
+                            content = get_role(cell, nlp_enterprise)
+                        elif k == 'moneysource':
+                            content = turnMoneySource(cell)
+                        else:
+                            content = cell
+                        if content != '':
+                            tmp_dic[k] = content
+                    if len(tmp_dic) > 1 and 'project_name' in tmp_dic and tmp_dic not in result_l:
+                        result_l.append(tmp_dic)
+        return result_l
+
+    def predict_table(self, html, nlp_enterprise=None):
+        '''
+        Collect multi-project records from the tables of an approval announcement.
+
+        :param html: announcement HTML
+        :param nlp_enterprise: passed on to ``get_table_info``; defaults to a fresh empty list
+        :return: list of de-duplicated record dicts produced by ``get_table_info``
+        '''
+        # A fresh list per call avoids sharing one mutable default between calls and instances.
+        nlp_enterprise = [] if nlp_enterprise is None else nlp_enterprise
+        html = re.sub("<html>|</html>|<body>|</body>", "", html)
+        html = re.sub("##attachment##", "", html)
+        soup = BeautifulSoup(html, 'lxml')
+        richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
+        self.nlp_enterprise = nlp_enterprise
+        if richText:
+            richText = richText.extract()  # detach the attachment block from the main content
+        tables = soup.find_all('table')
+        if len(tables) == 0 and richText:
+            tables = richText.find_all('table')  # use attachment tables only when the body has none
+        tables.reverse()
+        data_list = []
+        for table in tables:
+            trs = self.tb.table2list(table)
+            # Only handle tables with at least two rows where every row has the same number of columns.
+            if len(trs) > 1 and len(set(trs[0])) > 0 and len(set([len(tr) for tr in trs])) == 1:
+                df = pd.DataFrame(trs)
+                rs_l = self.get_table_info(df, nlp_enterprise)
+                for d in rs_l:  # doc 53338603: only project name + construction content together are unique, so compare whole records
+                    if d not in data_list:
+                        data_list.append(d)
+                if rs_l:
+                    # NOTE(review): presumably removed so that an enclosing table handled later no longer contains it — confirm
+                    table.extract()
+        return data_list
+
+    def predict(self, list_sentences, list_entitys, html, nlp_enterprise=[], span=12):
+        tabel_rs = self.predict_table(html, nlp_enterprise)  # 表格多项目提取
+        soup = BeautifulSoup(html, 'lxml')
         texts_list = self.recursive_text(soup)
         rs_dic = {k: "" for k in
                   self.other_part.keys() | self.role_type.keys() | self.date_type.keys() | self.addr_type.keys() | self.money_type.keys() | self.person_type.keys()}
@@ -8067,6 +8270,15 @@ class ApprovalPredictor():
                 rs_l.append(multi_project)
         if not_sure_role != '' and rs_dic.get('construct_company', '') == '' and not_sure_role not in org_set: # 补充,单位名称:这种作为建设单位 例:400069851014
             rs_dic['construct_company'] = not_sure_role
+        if len(tabel_rs) > 1:
+            rs_dic_key = [k for k, v in rs_dic.items() if v != '']
+            keys = set(["approver", "publisher", "time_release", "phone", "doc_num"]) & set(rs_dic_key) - set(tabel_rs[0].keys())
+            if keys:
+                for d in tabel_rs:
+                    for k in keys:
+                        d[k] = rs_dic[k]
+            return tabel_rs
+
         if len(rs_l)>1 and len(set(rs_l[0].keys()))>2 and set(rs_l[0].keys())==set(rs_l[1].keys()):
             for k in self.role_type.keys(): # 多项目无建设单位等通过整篇提取补充
                 if rs_dic.get(k, '') != '' and k not in rs_l[0].get(k, '') == '':
@@ -8074,7 +8286,7 @@ class ApprovalPredictor():
                         if d.get(k, '') == '':
                             d[k] = rs_dic[k]
             return rs_l
-        elif found_key == 1:
+        if found_key == 1:
             district = getPredictor('district').get_area(
                 rs_dic['approver'] + rs_dic['project_name'] + rs_dic['project_addr'], '')
             if district['district']['province'] != '全国':
@@ -8144,6 +8356,14 @@ class ApprovalPredictor():
                     break
         return approval
 
+    def add_codename2approval(self, approval, codeName):
+        '''
+        Fill a single approval record's missing project code / name from the document-level result.
+
+        :param approval: list of approval dicts; only touched when it holds exactly one record
+        :param codeName: list whose first element is a dict read through its 'code' (list) and 'name' keys
+        :return: the same ``approval`` list, updated in place
+        '''
+        if len(approval) == 1 and codeName:
+            # Treat missing, None and empty values alike so a None 'code' cannot be indexed
+            # and a None 'name' is never written as the project name.
+            codes = codeName[0].get('code') or []
+            name = codeName[0].get('name') or ''
+            if 'project_code' not in approval[0] and codes:
+                approval[0]['project_code'] = codes[0]
+            if 'project_name' not in approval[0] and name:
+                approval[0]['project_name'] = name
+        return approval
+
 class BiddingScore():
     def __init__(self):
         self.head_rule_dic = {
@@ -8353,8 +8573,8 @@ class EntityTypeRulePredictor():
     def __init__(self):
         self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选))?(会议)?地[点址区]([((]网址[))])?[:为]'
         self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址区]([((]网址[))])?[:为]'
-        self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|供货|卸货)((期|时间)[及和、])?)?地[点址区]?[:为]'
-        self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|展示|看样|拍卖)(实施|服务|现场)?(地[点址区]|位置|所在地区?)(位于)?[:为]|项目位于|所在(区域|地区):|存放地[点址]?[:为]'
+        self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|供货|卸货)((期|时间)[及和、])?)?(地[点址区]?|区域)[:为]'
+        self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|展示|看样|拍卖)(实施|服务|现场)?(地[点址区]|位置|所在地区?)(位于)?[:为]|项目位于|[^\w][属](区域|地区):|存放地[点址]?[:为]' # 银行所属区域:北京市西城区 不作项目地址
         self.pattern_addr_contact = '(联系|收件人?|邮寄)地[点址区][:为]|行政区:'
         self.pattern_time_planned = '(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
         self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'