소스 검색

优化审批提取、多中标人、角色金额、地区匹配、公告类别

lsm 8 달 전
부모
커밋
ec709c3122

+ 1 - 1
BiddingKG/dl/interface/Entitys.py

@@ -342,7 +342,7 @@ class Role():
         result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
                   'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}
-        if result['role_name'] == 'tenderee':
+        if result['role_name'] in ['tenderee', 'win_tenderer']:
             result['role_prob'] = self.role_prob
         if result['role_name'] == 'win_tenderer' and self.multi_winner != set():
             self.multi_winner.add(result['role_text'])

+ 12 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -708,6 +708,8 @@ def tableToText(soup, docid=None):
         for i in range(len(inner_table)):
             for j in range(len(inner_table[i])):
                 inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
+                if origin_inner_table[i][j][0] in ['主要环境影响及预防或者减轻不良环境影响的对策和措施', '建设单位或地方政府作出的相关环保承诺', '公众反馈意见的联系方式'] and predict_list[i][j]!=1:
+                    inner_table[i][j] = [origin_inner_table[i][j][0], 1]
 
         if show:
             print(inner_table)
@@ -2963,6 +2965,11 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
             article_processed = article_processed[:idx]
         for it in re.finditer('[一二三四五六七八九十\d]、中标候选人名称,', article_processed): # 修复大纲类标点导致提取不到,例:515521734
             article_processed = re.sub(it.group(0), it.group(0)[:-1]+':', article_processed)
+        ser = re.search('项目[编代][码号]/项目名称:(?P<code>(\[审批\])?[\d\-]{10,30})/?(?P<name>[\u4e00-\u9fa5()]{4,35}[,。])', article_processed) # 优化项目编号名称一起写的情况 spxm-53340116.html
+        if ser:
+            article_processed = article_processed.replace(ser.group(0), '项目代码:%s,项目名称:%s' % (
+            ser.group('code'), ser.group('name')))
+        article_processed = re.sub('四舍五入至', '', article_processed) # 修复 533537050 ,中标价(四舍五入至万元):6468万元
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -3055,6 +3062,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
         cost_time[key_preprocess] += time.time()-start_time
 
         #nlp处理
+        outline_list = [] # 20240906 修复下面条件不成立时,后面 list_outlines.append(outline_list) 名称未定义报错
         if article_processed is not None and len(article_processed)!=0:
             split_patten = "。"
             sentences = []
@@ -3182,7 +3190,7 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
     # 使用正则识别金额
     entity_type = "money"
     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d:|\d=\d[-+×]\d:)?(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
                           "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
     # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元,合同金额:378.8万元 提取
@@ -4042,6 +4050,9 @@ if __name__=="__main__":
     # getPredictTable()
 
     text = '是否拟中标人:是,评标排名:1,价格类型:(万元)报价:13311.1582,得分:84.46,项目负责人:邓焱文'
+    text = ',采购包1:采购包预算金额(元:1,500000.00,采购包最高限价(元:1,430600.00,'
+    text = '成交人:中坤电力有限公司,成交价格:11493,603.52元,质量:合格,项目工期:117天,'
+    # text = '数量及单位1:65台,单价2:800,投标报价3=1×2:52000。'
     print(get_money_entity(text, found_yeji=0))
     # with open('D:/138786703.html', 'r', encoding='utf-8') as f:
     #     sourceContent = f.read()

+ 9 - 5
BiddingKG/dl/interface/extract.py

@@ -281,6 +281,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     log("get codename done of doc_id%s"%(doc_id))
     cost_time["codename"] = round(time.time()-start_time,2)
 
+    start_time = time.time()  # 公告类别预测
+    channel_dic, msc = predictor.getPredictor("channel").predict_merge(title, list_sentences[0], text,original_docchannel, web_source_no)
+    cost_time["rule_channel"] = round(time.time() - start_time, 2)
+
     start_time = time.time() # 角色金额模型提取
     predictor.getPredictor("prem").predict(list_sentences,list_entitys)
     log("get prem done of doc_id%s"%(doc_id))
@@ -299,7 +303,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time["product_attrs"] = round(time.time()-start_time,2)
 
     start_time = time.time() #正则角色提取
-    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName, all_winner=is_all_winner(title))
+    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName, channel_dic, all_winner=is_all_winner(title))
     cost_time["rule"] = round(time.time()-start_time,2)
 
     '''正则补充最后一句实体日期格式为招标或代理 2021/12/30;正则最后补充角色及去掉包含 公共资源交易中心 的招标人'''
@@ -383,9 +387,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
         prem[0]['prem'] = {}  # 审批项目不要这项
 
     else:
-        channel_dic, msc = predictor.getPredictor("channel").predict_merge(title,list_sentences[0], text, list_articles[0].bidway, prem[0], original_docchannel)
+        channel_dic, msc = predictor.getPredictor("channel").final_change(channel_dic, prem[0], title, text, original_docchannel, msc)
     # print('msc', msc)
-    cost_time["rule_channel"] = round(time.time()-start_time,2)
+    cost_time["rule_channel2"] = round(time.time()-start_time,2)
 
     '''一包多中标人提取及所有金额提取'''
     all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, is_all_winner(title))
@@ -442,11 +446,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-09-05'}
+    version_date = {'version_date': '2024-09-26'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
-        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys)
+        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text)
         data_res['approval'] = approval
 
     if channel_dic['docchannel']['doctype'] == '处罚公告': # 20240627 处罚公告进行失信要素提取

+ 14 - 4
BiddingKG/dl/interface/getAttributes.py

@@ -583,7 +583,7 @@ def getPackageScopePattern():
     for item in df["list_word"]:
         item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
         pattern += item+"|"
-    pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
+    pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}|##attachment##"
     return pattern
         
 pattern_packageScope = getPackageScopePattern()   
@@ -4427,7 +4427,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, a
                                 if d.get('role_name', '') == 'win_tenderer':
                                     if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
                                         d['multi_winner'] = ','.join(set(multi_winner))
-        else:
+        elif 0 < len(prem[0]) < 3:
             multi_winner = set([it[0] for it in winner_l]) - tenderee_or_agency
             if len(multi_winner) > 1:
                 for project in prem[0].values():
@@ -4629,7 +4629,7 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
     # return old_prem
 
-def  confirm_prem(prem, channel_dic):
+def confirm_prem(prem, channel_dic):
     '''
     规则检查纠正prem,如果Project包中标人在其他包中标人,去掉project包中标角色;如果有其他包中标人,去掉roleList为空的包;
     :param prem: prem 字段字典
@@ -4638,6 +4638,8 @@ def  confirm_prem(prem, channel_dic):
     if len(prem) > 1:  # 表格提取到中标人的,去掉project包中标人
         pro_winner = set()
         other_winner = set()
+        other_winner_prob = 0
+        pro_winner_prob = 0
         empty_roleList = []
         for k in prem:
             prem[k]['uuid'] = str(uuid.uuid4()) # 20240627 每个包都添加uuid
@@ -4651,16 +4653,24 @@ def  confirm_prem(prem, channel_dic):
                             pro_winner.update(set(d['win_tenderer_joint'].split(',')))
                         if 'multi_winner' in d:
                             pro_winner.update(set(d['multi_winner'].split(',')))
+                        if d['role_name'] == 'win_tenderer' and d.get('role_prob', 0)>0.6:
+                            pro_winner_prob = d.get('role_prob', 0)
                     else:
                         other_winner.add(d['role_text'])
                         if 'win_tenderer_joint' in d:
                             other_winner.update(set(d['win_tenderer_joint'].split(',')))
                         if 'multi_winner' in d:
                             other_winner.update(set(d['multi_winner'].split(',')))
-        if pro_winner & other_winner != set():
+                        if d['role_name'] == 'win_tenderer' and d.get('role_prob', 0)>0.6:
+                            other_winner_prob = d.get('role_prob', 0)
+        if pro_winner & other_winner != set() or other_winner_prob>pro_winner_prob: # 如果默认包与其他包中标人重复或其他包中标人概率比默认包大,删除默认包中标人
             prem['Project']['roleList'] = [d for d in prem['Project']['roleList'] if
                                                d['role_name'] not in ['win_tenderer', 'second_tenderer',
                                                                       'third_tenderer']]
+        elif other_winner_prob<pro_winner_prob and len(prem)==2: # 两个包情况,如果默认包中标人概率比其他包大,删除其他包
+            rm_k = [k for k in prem if k != 'Project']
+            for k in rm_k:
+                prem.pop(k)
         if other_winner and channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告']:
             for k in empty_roleList:
                 prem.pop(k)

+ 1 - 0
BiddingKG/dl/interface/modelFactory.py

@@ -104,6 +104,7 @@ class Model_role_classify_word():
             text = re.sub('(最终)?排名:', '    ', text)
         text = re.sub('交易单位', '发布单位', text)
         text = re.sub('[,:]各种数据:', ':', text) # 20240620优化 478331984 山东省交通运输厅站源提取不到 各种数据:中标单位,各种数据:济南金曰公路工程有限公司,
+        text = re.sub('电子签章', '', text) # 20240924 修复 529923459 电子签名:投标人名称(电子签章:西君兰信息科技有限公司,2024年9月7日 预测为中标
         return text.replace('(', '(').replace(')', ')').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
 
     def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):

+ 280 - 141
BiddingKG/dl/interface/predictor.py

@@ -870,7 +870,7 @@ class PREMPredict():
                 elif re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', front)==None:
                     values[2] = 0.5
                     label = 5
-                elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$', front):  # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模
+                elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$|乙方接受为$', front):  # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模
                     label = 5
                 elif re.search(',来源:$', front) and re.search('^,', behind): # 修复 472062585 项目采购-关于定制手机询比价采购中标公告,来源:深圳市网联安瑞网络科技有限公司 预测为中标
                     label = 0
@@ -1559,7 +1559,7 @@ class RoleRulePredictor():
         return (_label, _prob, _flag, keyword)
 
 
-    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5, all_winner=False):
+    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, channel_dic, on_value=0.5, all_winner=False):
 
         for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
                                                                       list_codenames):
@@ -1827,9 +1827,7 @@ class RoleRulePredictor():
                                     p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                     p_entity.label = 0
                                     # print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
-            if notfound_tenderer and len(set([ent.entity_text for ent in candidates])) == 1 and re.search(
-                    '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|磋商|交易|评审)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
-                    article.title+article.content[:100]):
+            if notfound_tenderer and len(set([ent.entity_text for ent in candidates])) == 1 and channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告']:
                 for p_entity in candidates:
                     # print('只有一个候选人的作为中标人', p_entity.entity_text)
                     p_entity.label = 2
@@ -3562,6 +3560,8 @@ class ProductAttributesPredictor():
                                     if link['unitPrice'] != "" and link['quantity'] != '':
                                         try:
                                             total_product_money += float(link['unitPrice'])*float(link['quantity']) if float(link['quantity'])<50000 else 0
+                                            if float(link['unitPrice'])>10000 and float(link['quantity'])>100: # 修复 325105750 总价做单价 造成中标金额错误
+                                                total_product_money = 0
                                         except:
                                             log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
 
@@ -4034,9 +4034,9 @@ class DocChannel():
           '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|(供应|招标)计划表?$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
           '公告变更': '第[\d一二]次变更|(变更|更正(事项)?|更改|延期|暂停)(招标|采购)?的?(公告|公示|通知)|变更$|更正$',
           '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)',
-          '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
+          '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍|停止)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
           '合同公告': '(合同(成交|变更)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$', # |(履约|验收)(结果)?
-          '候选人公示': '候选人(变更)?公示|评标(结果)?公示|评审结果', #中标前公示|中标预公示|
+          '候选人公示': '候选人(变更)?公示|评标(结果)?([告]|报告)|评审结果', #中标前公示|中标预公示|
           '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$|项目中标', # |开标(记录|信息|情况)
           '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
           '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
@@ -4271,14 +4271,13 @@ class DocChannel():
               log('正则把中标信息修改为空')
       return channel_dic
 
-  def predict_merge(self, title, list_sentence, html, bidway, prem, original_docchannel='', web_source_no=''):
+  def predict_merge(self, title, list_sentence, html, original_docchannel='', web_source_no=''):
       '''
       正则,模型混合预测,返回公告类型及生命周期
       :param title:  公告标题
       :param content: 预处理后的返回的句子实体列表 list_sentence
       :param html: 公告原文 html 内容
       :param bidway: 招标方式
-      :param prem: 提取的prem 字典
       :return: {'docchannel': {'docchannel':'中标信息', 'doctype':'采招数据'}} 字典格式
       '''
       def cut_single_cn_space(text):
@@ -4315,12 +4314,6 @@ class DocChannel():
                   kw.append(re.search(p, text).group(0))
           return num, ';'.join(kw)
 
-      def is_contain_winner(extract_json):
-          if re.search('win_tenderer', extract_json):
-              return True
-          else:
-              return False
-
       def is_single_source(bidway, title):
           if re.search('单一来源|单一性采购', title):
               return True
@@ -4407,12 +4400,16 @@ class DocChannel():
           if '采购意向' in life_kw_title or '采购意向' in life_list:
               if '中标信息' in life_kw_title or '中标信息' in life_list:
                   return '中标信息', msc
+              elif '候选人公示' in life_kw_title:
+                  return '候选人公示', msc
               elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set():
                   return '', msc
               return '采购意向', msc
           elif '招标预告' in life_kw_title or '招标预告' in life_list:
               if '中标信息' in life_kw_title or '中标信息' in life_list:
                   return '中标信息', msc
+              elif '候选人公示' in life_kw_title:
+                  return '候选人公示', msc
               elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set():
                   return '', msc
               return '招标预告', msc
@@ -4435,8 +4432,6 @@ class DocChannel():
                   return '', msc
               return '招标答疑', msc
           elif '开标记录' in life_kw_title:
-              if '开标结果' in title and is_contain_winner(prem_json):
-                  return '中标信息', msc
               return '开标记录', msc
           elif '验收合同' in life_kw_title:
               return '验收合同', msc
@@ -4514,86 +4509,6 @@ class DocChannel():
           prob = pred[0][id]
           return id, prob
 
-      def final_change(msc):
-          '''
-          修改逻辑:
-          1、中标公告、合同公告无中标人且原始为非中标,返回原类型
-          2、废标公告有中标人且标题无废标关键词,返回中标信息
-          3、答疑公告标题无答疑关键且原始为招标,返回原始类别
-          4、招标公告有中标人且原始为中标,返回中标信息
-          5、预测为招标,原始为预告、意向,返回原始类别
-          6、预测及原始均在变更、答疑,返回原始类别
-          7、预测为采招数据,原始为产权且有关键词,返回原始类别
-          8、废标公告原始为招标、预告且标题无废标关键期,返回原始类别
-          9、若预测为非采招数据且源网为采招数据且有招标关键词返回采招数据
-          10、招标公告有中标人,且标题有直购关键词,改为中标信息
-          11、预测预告,原始为意向、招标且标题无预告关键词,返回原始类别
-          '''
-          if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
-                  original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(
-              prem_json)==False and re.search(self.title_life_dic['中标信息'], title)==None:
-              result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
-              msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
-          elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
-                  self.title_life_dic['废标公告'], title) == None:
-              result['docchannel']['docchannel'] = '中标信息'
-              msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;'
-          elif result['docchannel']['docchannel'] in ['招标答疑'] and re.search(
-                  self.title_life_dic['招标答疑'], title) == None and origin_dic.get(
-                  original_docchannel, '') in ['招标公告', '采购意向', '招标预告']:
-              result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
-              msc += '最终规则修改:答疑公告标题无答疑关键且原始为招标,返回原始类别;'
-          elif result['docchannel']['docchannel'] == '招标公告' and is_contain_winner(prem_json) and origin_dic.get(
-                  original_docchannel, '') == '中标信息':
-              result['docchannel']['docchannel'] = '中标信息'
-              msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
-          elif result['docchannel']['docchannel'] in ['招标公告'] and origin_dic.get(
-                  original_docchannel, '') in ['采购意向', '招标预告']:
-              result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
-              msc += '最终规则修改:预测为招标,原始为预告、意向,返回原始类别'
-          elif result['docchannel']['docchannel'] in ['招标预告'] and origin_dic.get(
-                  original_docchannel, '') in ['采购意向', '招标公告'] and re.search(
-              self.title_life_dic['招标预告'], title)==None:
-              result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
-              msc += '最终规则修改:预测预告,原始为意向、招标且标题无预告关键词,返回原始类别'
-          elif result['docchannel']['docchannel'] in ['招标答疑', '公告变更'] and origin_dic.get(
-                  original_docchannel, '') in ['招标答疑', '公告变更']:
-              result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
-              msc += '最终规则修改:预测及原始均在答疑、变更,返回原始类别'
-          elif result['docchannel']['doctype'] == '采招数据' and origin_dic.get(
-                  original_docchannel, '') in ['产权交易', '土地矿产'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产', text):
-              result['docchannel']['doctype'] = origin_dic.get(original_docchannel, '')
-              msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别'
-          elif result['docchannel']['docchannel'] == '废标公告' and origin_dic.get(
-                  original_docchannel, '') in ['招标公告', '采购意向', '招标预告'] and re.search(
-                  self.title_life_dic['废标公告'], title) == None:
-              result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
-              msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
-          elif result['docchannel']['docchannel'] in ['招标公告', '招标预告'] and is_contain_winner(
-                  prem_json) and re.search('直购', title):
-              result['docchannel']['docchannel'] = '中标信息'
-              msc += "最终规则修改:预测为招标却有中标人且标题有直购关键词返回中标"
-
-          if result['docchannel']['doctype'] in ['产权交易', '土地矿产', '拍卖出让'] and origin_dic.get(
-                  original_docchannel, '') not in ['产权交易', '土地矿产', '拍卖出让'] \
-                and (re.search(self.title_type_dic['采招数据'], title) or re.search('工程|服务|采购|询价|磋商', title) or re.search('(采购|招投?标|投标)(信息|内容|项目|公告|数量|人|单位|方式)|(建设|工程|服务|施工|监理|勘察|设计)项目|(%s)'%self.type_dic['采招数据'], text)):
-              result['docchannel']['doctype'] = '采招数据'
-              msc += ' 最终规则修改:预测为非采招数据,原始为采招数据且有招标关键词,返回采招数据'
-          elif result['docchannel']['doctype'] in ['土地矿产'] and origin_dic.get(original_docchannel, '') in ['拍卖出让', '产权交易']:
-              if origin_dic.get(original_docchannel, '') in ['拍卖出让'] and (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
-                  result['docchannel']['doctype'] = '拍卖出让'
-                  msc += "最终规则修改:预测为土地矿产原始为拍卖且有拍卖关键词,返回拍卖"
-              elif (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)):
-                  result['docchannel']['doctype'] = '产权交易'
-                  msc += "最终规则修改:预测为土地矿产原始为产权交易且有产权交易关键词,返回产权交易"
-
-          '''下面是新格式增加返回字段'''
-          if result['docchannel']['docchannel'] != '':  # 预测到生命周期的复制到life_docchannel,否则用数据源结果
-              result['docchannel']['life_docchannel'] = result['docchannel']['docchannel']
-          else:
-              result['docchannel']['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
-          return msc
-
       not_extract_dic = {
           104: '招标文件',
           106: '法律法规',
@@ -4628,10 +4543,13 @@ class DocChannel():
        118: '废标公告',
        119: '候选人公示',
        120: '合同公告'}
+
+      self.origin_dic = origin_dic
+
       if original_docchannel in not_extract_dic:
           return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel], 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '公告类别不在提取范围'
       if web_source_no in ['02104-7', '04733', 'DX007628-6']: # 这些数据源无法识别
-          return {'docchannel': {'docchannel': '', 'doctype': '采招数据', 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '此数据源公告分类不明确,返回数据源类别'
+          return {'docchannel': {'docchannel': origin_dic.get(original_docchannel, '原始类别'), 'doctype': '采招数据', 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '此数据源公告分类不明确,返回数据源类别'
       if original_docchannel == 303:
           return {'docchannel': {'docchannel': '处罚公告', 'doctype': '处罚公告', 'life_docchannel': '处罚公告'}}, "源类别为处罚公告"
 
@@ -4640,7 +4558,7 @@ class DocChannel():
           title = title[:20] + title[-30:]
 
       text = html2text(html)
-      prem_json = json.dumps(prem, ensure_ascii=False)
+
       result = {'docchannel': {'docchannel': '', 'doctype': ''}}
 
       doc_type, type_kw = get_type(title, text)
@@ -4674,10 +4592,113 @@ class DocChannel():
                       result['docchannel']['docchannel'] = life_model
                       msc += life_model + ' 概率:%.4f;\n'%life_prob
 
-      msc = final_change(msc)
+      # msc = final_change(msc)
       # print('channel ', msc)
       return result, msc
 
+  def final_change(self, result, prem, title, text, original_docchannel, msc):
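+      '''
+      Apply the final rule-based corrections to a channel prediction, using the extracted prem.
+      :param result: channel result dict ({'docchannel': {...}}); modified in place
+      :param prem: extracted prem dict; dumped to JSON and searched for 'win_tenderer'
+      :param title: announcement title
+      :param text: announcement body text
+      :param original_docchannel: source-site channel code, looked up in self.origin_dic (set by predict_merge)
+      :param msc: remark string; a note is appended for each rule that fires
+      :return: tuple (result, msc): the corrected channel dict and the updated remark string
+      '''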
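+      '''
+      Correction rules:
+      1. Predicted 中标信息/合同公告, no winner, source channel is 招标公告/采购意向/招标预告/公告变更 and the title has no 中标信息 keyword: return the source channel
+      2. Predicted 废标公告 but a winner exists and the title has no 废标 keyword: change to 中标信息
+      3. Predicted 招标答疑, the title has no 答疑 keyword and the source channel is 招标公告/采购意向/招标预告: return the source channel
+      4. Predicted 招标公告 but a winner exists and the source channel is 中标信息: change to 中标信息
+      5. Predicted 招标公告 and the source channel is 采购意向/招标预告: return the source channel
+      6. Predicted and source channels are both in 招标答疑/公告变更: return the source channel
+      7. Predicted doctype 采招数据, source is 产权交易/土地矿产 and the text has a property keyword: return the source type
+      8. Predicted 废标公告, source channel is 招标公告/采购意向/招标预告 and the title has no 废标 keyword: return the source channel
+      9. Predicted doctype is 产权交易/土地矿产/拍卖出让, the source is not, and a procurement keyword is present: return 采招数据
+      10. Predicted 招标公告/招标预告 but a winner exists and the title contains 直购: change to 中标信息
+      11. Predicted 招标预告, source channel is 采购意向/招标公告 and the title has no 预告 keyword: return the source channel
+      12. Predicted 开标记录, the title contains 开标结果 and a winner exists: change to 中标信息
+      '''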
+      def is_contain_winner(extract_json):
+          if re.search('win_tenderer', extract_json):
+              return True
+          else:
+              return False
+
+      origin_dic = self.origin_dic
+      prem_json = json.dumps(prem, ensure_ascii=False)
+      if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
+              original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(
+          prem_json) == False and re.search(self.title_life_dic['中标信息'], title) == None:
+          result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
+          msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
+      elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
+              self.title_life_dic['废标公告'], title) == None:
+          result['docchannel']['docchannel'] = '中标信息'
+          msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;'
+      elif result['docchannel']['docchannel'] in ['招标答疑'] and re.search(
+              self.title_life_dic['招标答疑'], title) == None and origin_dic.get(
+          original_docchannel, '') in ['招标公告', '采购意向', '招标预告']:
+          result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
+          msc += '最终规则修改:答疑公告标题无答疑关键且原始为招标,返回原始类别;'
+      elif result['docchannel']['docchannel'] == '招标公告' and is_contain_winner(prem_json) and origin_dic.get(
+              original_docchannel, '') == '中标信息':
+          result['docchannel']['docchannel'] = '中标信息'
+          msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
+      elif result['docchannel']['docchannel'] in ['招标公告'] and origin_dic.get(
+              original_docchannel, '') in ['采购意向', '招标预告']:
+          result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
+          msc += '最终规则修改:预测为招标,原始为预告、意向,返回原始类别'
+      elif result['docchannel']['docchannel'] in ['招标预告'] and origin_dic.get(
+              original_docchannel, '') in ['采购意向', '招标公告'] and re.search(
+          self.title_life_dic['招标预告'], title) == None:
+          result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
+          msc += '最终规则修改:预测预告,原始为意向、招标且标题无预告关键词,返回原始类别'
+      elif result['docchannel']['docchannel'] in ['招标答疑', '公告变更'] and origin_dic.get(
+              original_docchannel, '') in ['招标答疑', '公告变更']:
+          result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
+          msc += '最终规则修改:预测及原始均在答疑、变更,返回原始类别'
+      elif result['docchannel']['doctype'] == '采招数据' and origin_dic.get(
+              original_docchannel, '') in ['产权交易', '土地矿产'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产', text):
+          result['docchannel']['doctype'] = origin_dic.get(original_docchannel, '')
+          msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别'
+      elif result['docchannel']['docchannel'] == '废标公告' and origin_dic.get(
+              original_docchannel, '') in ['招标公告', '采购意向', '招标预告'] and re.search(
+          self.title_life_dic['废标公告'], title) == None:
+          result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
+          msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
+      elif result['docchannel']['docchannel'] in ['招标公告', '招标预告'] and is_contain_winner(
+              prem_json) and re.search('直购', title):
+          result['docchannel']['docchannel'] = '中标信息'
+          msc += "最终规则修改:预测为招标却有中标人且标题有直购关键词返回中标"
+      elif result['docchannel']['docchannel'] == '开标记录' and '开标结果' in title and is_contain_winner(prem_json):
+          msc += "最终规则修改:开标结果包含中标人的作为中标信息"
+          result['docchannel']['docchannel'] = '中标信息'
+      if result['docchannel']['doctype'] in ['产权交易', '土地矿产', '拍卖出让'] and origin_dic.get(
+              original_docchannel, '') not in ['产权交易', '土地矿产', '拍卖出让'] \
+              and (re.search(self.title_type_dic['采招数据'], title) or re.search('工程|服务|采购|询价|磋商', title) or re.search(
+          '(采购|招投?标|投标)(信息|内容|项目|公告|数量|人|单位|方式)|(建设|工程|服务|施工|监理|勘察|设计)项目|(%s)' % self.type_dic['采招数据'], text)):
+          result['docchannel']['doctype'] = '采招数据'
+          msc += ' 最终规则修改:预测为非采招数据,原始为采招数据且有招标关键词,返回采招数据'
+      elif result['docchannel']['doctype'] in ['土地矿产'] and origin_dic.get(original_docchannel, '') in ['拍卖出让', '产权交易']:
+          if origin_dic.get(original_docchannel, '') in ['拍卖出让'] and (
+                  re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
+              result['docchannel']['doctype'] = '拍卖出让'
+              msc += "最终规则修改:预测为土地矿产原始为拍卖且有拍卖关键词,返回拍卖"
+          elif (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)):
+              result['docchannel']['doctype'] = '产权交易'
+              msc += "最终规则修改:预测为土地矿产原始为产权交易且有产权交易关键词,返回产权交易"
+
+      '''下面是新格式增加返回字段'''
+      if result['docchannel']['docchannel'] != '':  # 预测到生命周期的复制到life_docchannel,否则用数据源结果
+          result['docchannel']['life_docchannel'] = result['docchannel']['docchannel']
+      else:
+          result['docchannel']['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
+      return result, msc
+
 # 保证金支付方式提取
 class DepositPaymentWay():
     def __init__(self,):
@@ -6001,7 +6022,7 @@ class DistrictPredictor():
             return province_l, city_l, district_l
 
         def get_pro_city_dis_score(text, text_weight=1):
-            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾', ' ', text)
+            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区', ' ', text)
             text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
             text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域:怒江州 识别为广西 - 崇左 - 江州
             text = re.sub('茂名滨海新区', '茂名市', text)
@@ -6275,6 +6296,8 @@ class DistrictPredictor():
         text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  # 预防提取错 合肥 路南 新会 等地区
 
         if pro_addr and re.search('\w{2,}([省市县旗盟]|自治[区州县旗])', pro_addr):
+            if re.search('[市县旗盟]', pro_addr)==None: # 修复 486623506 项目地址不完整
+                pro_addr = text1 + ' '+ pro_addr
             msc += '## 使用项目地址输入:%s ##;' % pro_addr
             rs = self.get_area(pro_addr, '')
             msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
@@ -6422,7 +6445,6 @@ def is_head_line(list_item):
     predict_y = getPredictor("form").predict(np.array(x), type="item")
     count = 0
     for item, values in zip(list_item, list(predict_y)):
-        print(item, values[1])
         if values[1] > 0.6:
             count += 1
     if count/len(list_item)>0.6:
@@ -6511,6 +6533,8 @@ class TablePremExtractor(object):
                     elif re.search('^((投标|应答|响应|候选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|(存款|投标)?银行|供应商)(名称)?$|^机构名称$|^单位(名称)?$', text) and re.search('未', text)==None:
                         other_tenderer2 = (i, text)
                 if num>1:
+                    if re.search(self.head_rule_dic['project_code'], text) and re.search(self.head_rule_dic['package_code'], text): # 修复 528486798 分标编号-包号
+                        continue
                     # print('表头错误,一个td匹配到两个表头:', header_dic)
                     return flag, contain_header, dict(), not_sure_winner
             if re.search(';金额((万?元))?;', ';'.join(td_list)):  # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
@@ -7399,21 +7423,41 @@ class WebsourceTenderee():
                                               ]}
         return prem
 
+def get_header_line(list_item):
+    '''
+    Label each text in list_item as a table header (1) or not (0), using the "form" predictor plus fixed overrides.
+    :param list_item: list of texts to classify, e.g. [ '批复结果', '许可/同意', '批复文号',]
+    :return: list of 0/1 labels, one per input text, in input order
+    '''
+    rs = []
+    x = []
+    for item in list_item:
+        x.append(getPredictor("form").encode(item))
+    predict_y = getPredictor("form").predict(np.array(x), type="item")
+    for item, values in zip(list_item, list(predict_y)):
+        lb = 1 if values[1] > 0.5 else 0  # values[1] is presumably the header-class score; confirm against the "form" predictor
+        if item in ['许可/同意', '办结(通过)', '办结(准予许可)','批准']:  # these exact texts are forced to 0 whatever the model predicted
+            lb = 0
+        elif item in ['环境影响评价机构', '建设单位或地方政府作出的相关环保承诺']:  # these exact texts are forced to 1 whatever the model predicted
+            lb = 1
+        rs.append(lb)
+    return rs
+
 class ApprovalPredictor():
     def __init__(self):
         '''
         项目(法人)单位
         '''
         self.other_part = {
-            "project_name": "((项目|工程|采购|招标|计划|建设|规划)名称?|生产建设项目|申请项目):(?P<main>[^:。]{5,50})[,。](\w{2,10}:|$)?", # 项目名称
-            "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案|索引)(编[号码]|号)):?(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}:|$)?", # 项目编号
-            "doc_num": "((审[批查核]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案|核准|许可|确认|受理|申请报告|文件|意见书|办件)[文编]?号|综合受理号|文书?号|合格书号):?(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-.]{5,30}号?)[,。]?(\w{2,10}:|$)?", # 文号
-            "pro_type": "((申[报请]|审核备|项目|立项)(类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业):(?P<main>[^:。]{2,30})[,。](\w{2,10}:|$)?", # 项目类型
-            "year_limit": "((建设|工程|服务|项目)(起止|\w{,2})?(年限|期限|时长|工期)):(约|超过|大概|建设工期|共计|合计)?(?P<main>[\d一二三四五六七八九十]+个月|\d{1,3}(日?历?天|小时)|20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?([至—-]+20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?)?)[(,。](\w{2,10}:|$)?", # 建设年限
-            "construction_scale": "(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|招标|采购))?内容|(建设|工程|项目)(主要)?(规模|内容|概况|面积)([及和](主要)?(规模|内容|概况|面积))?(如下)?):(?P<main>[^:。]{2,250})[,。](\w{2,10}:|$)?", # 建设规模
-            "approval_items": "((审[批查核]|批[复准]|申请|监管)(事项|内容|名称)|事项名称|事项审批):(?P<main>[^:。]{2,70})[,。](\w{2,10}:|$)?", # 审批事项
-            "properties": "((建设|工程|项目)性质):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 建设性质
-            "approval_result": "((审[批查核]|批[复准]|核[发准]|许可|抽查|备案)(结果|决定|结论|状态|回复|意见)|(办[理件]|,)(状态|意见|结果)|项目(当前|目前)?状态):(?P<main>[^:。]{2,20})[,。](\w{2,10}:|$)?", # 审批结果
+            "project_name": "((项目|工程|采购|招标|计划|建设|规划)名称?|生产建设项目|申请项目):(?P<main>[^:。]{5,50})[,。]([\w()]{2,15}:|$)?", # 项目名称
+            "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案|索引)([代][号码]|号)):?(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)([\w()]{2,15}:|$)?", # 项目编号
+            "doc_num": "((环评|\w{,3})(审[批查核]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案|核准|许可|确认|受理|申请报告|文[件书]|意见书|办件)[文编证]?号|综合受理号|文书?号|合格书号|申报号|(办件|事项)[编代][号码]|收件号))?为?:?(?P<main>[()〔〕【】\[\]0-9]{,8}([\w()〔〕【】]{2,15})?[()〔〕【】\[\]a-zA-Z0-9-.]{3,30}号?)[,。]?([\w()]{2,15}:|$)?", # 文号
+            "pro_type": "((申[报请]|审核备|项目|立项)(类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业):(?P<main>[^:。]{2,30})[,。]([\w()]{2,15}:|$)?", # 项目类型
+            "year_limit": "((建设|工程|服务|项目)(起止|\w{,2})?(年限|期限|时长|工期)):(约|超过|大概|建设工期|共计|合计)?(?P<main>[\d一二三四五六七八九十]+个月|\d{1,3}(日?历?天|小时)|20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?([至—-]+20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?)?)[(,。]([\w()]{2,15}:|$)?", # 建设年限
+            "construction_scale": "([\d一二三四五六七八九十]{1,2}、|([\d一二三四五六七八九十]{1,2}))?(工程|项目|\w{,4})?((建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|招标|采购))?内容|(建设|工程|项目)(主要)?(规模|内容|概况|面积)([及和](主要)?(规模|内容|概况|面积))?(如下|为)?)|^规模(情况)?):(?P<main>[^:。]{2,500})[,。]?([\w()]{2,30}:|$)?", # 建设规模 #56924861 主要环境影响及预防或者减轻不良环境影响的对策和措施:
+            "approval_items": "((审[批查核]|批[复准]|申请|监管|受理)(事项|内容|名称)|事项名称|事项审批):(?P<main>[^:。]{2,150})[,。]([\w()]{2,15}:|$)?", # 审批事项
+            "properties": "((建设|工程|项目)性质):(?P<main>[^:。]{2,50})[,。]([\w()]{2,15}:|$)?", # 建设性质
+            "approval_result": "((审[批查核]|批[复准]|核[发准]|许可|抽查|备案)(结果|决定|结论|状态|回复|意见)|(办[理件]|,)(状态|意见|结果)|项目(当前|目前)?状态):(?P<main>[^:。]{2,20})[,。]([\w()]{2,15}:|$)?", # 审批结果
             "phone": "(联系)?电话:(?P<main>1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|" # 联系电话
                      '\+86.?1[3-9]\d{9}|'
                      '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
@@ -7423,12 +7467,12 @@ class ApprovalPredictor():
                      '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
                      '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|'
                      '400\d{7}转\d{1,4}|'
-                     '[2-9]\d{6,7})[,。](\w{2,10}:|$)?'
+                     '[2-9]\d{6,7})[,。]([\w()]{2,15}:|$)?'
         }
 
         self.role_type = {
             "declare_company": "(申[请报]|填报|呈报)(人|部门|机关|单位|企业|公司|机构|组织)",  # 申报单位
-            "construct_company": "(业主|建设|用地|委托|发包|产权|项目))?(部门|机关|单位|企业|公司|方|业主)|主送机关|法人单位|甲方",  # 建设单位
+            "construct_company": "(业主|建设|用地|委托|发包|产权|项目|法人|采购|招标|询价))?(部门|机关|单位|企业|公司|方|业主|人)|主送机关|法人单位|甲方",  # 建设单位
             "approver": "(审[批查核议图]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|承办))?(部门|机关|单位|企业|公司|机构)|实施主体",  # 审批部门
             "evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)" , # 环评机构
             "compilation_unit": "编制单位", # 编制单位 20240701加
@@ -7446,14 +7490,38 @@ class ApprovalPredictor():
         }
 
         self.addr_type = {
-            "project_addr": "(建设|工程|项目|施工|地块|用地)\w{,2}(地址|地点|位置|所在地)|[宗土]地坐落" # 建设地址
+            "project_addr": "((建设|工程|项目|施工|地块|用地)\w{,2}(地址|地点|位置|所在地)|[宗土]地坐落)" # 建设地址
         }
 
         self.money_type = {
-            "total_tendereeMoney": "(项目|概算|投资)金额|项目投资|总投资|总预算|总概算|投资(规模|总额|估算|概算)|批复概算|投资额", # 总投资
+            "total_tendereeMoney": "(项目|概算|投资)金额|项目投资|总投资|总预算|总概算|投资(规模|总额|估算|概算)|批复概算|投资额|项目概算", # 总投资
         }
 
-    def predict(self, list_sentences, list_entitys, span=12):
+    def recursive_text(self, tag):
+        '''
+        Recursively collect the non-empty text pieces found under a parsed HTML node, with all whitespace removed.
+        :param tag: node exposing .children (predict() passes the BeautifulSoup object built from the html)
+        :return: list of text strings in traversal order
+        '''
+        texts = []
+        for child in tag.children:
+            if child.name:
+                if child.name in ['p'] and len(child.find_all('br'))>2:
+                    texts.extend(self.recursive_text(child))  # NOTE(review): next test is `if`, not `elif` - such a <p> is recursed here AND handled again below; confirm intended
+                if child.name in ["td", "th", "p", "li", "h1", "h2", "h3", "h4", "h5",
+                                  "h6"] and child.get_text().strip():
+                    texts.append(re.sub('\s', '', child.get_text().strip().replace(':', ':').replace('(', '(').replace(')', ')')))  # keep the whole element text as one entry; NOTE(review): replace() pairs look identical here - presumably full-width to ASCII in the real file, confirm
+
+                else:
+                    texts.extend(self.recursive_text(child))  # any other element (or a listed one with empty text): descend into its children
+            else:
+                if child.strip():  # child has no .name - presumably a bare text node; skip whitespace-only ones
+                    texts.append(re.sub('\s', '', child.strip().replace(':', ':').replace('(', '(').replace(')', ')')))
+        return texts
+
+    def predict(self, list_sentences, list_entitys, html, span=12):
+        soup = BeautifulSoup(html)
+        texts_list = self.recursive_text(soup)
         rs_dic = {k: "" for k in
                   self.other_part.keys() | self.role_type.keys() | self.date_type.keys() | self.addr_type.keys() | self.money_type.keys() | self.person_type.keys()}
         rs_dic['moneysource'] = ""
@@ -7475,13 +7543,19 @@ class ApprovalPredictor():
                 b, e = entity.wordOffset_begin, entity.wordOffset_end
                 if entity.entity_type in ['org', 'company']:
                     flag = 1
+                    role_l = []
                     for k, v in self.role_type.items():
-                        if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
-                            if rs_dic[k] == '':
-                                rs_dic[k] = entity.entity_text
-                            multi_project[k] = entity.entity_text
-                            found_key = 1
-                            flag = 0
+                        ser = re.search(v, sentences[entity.sentence_index][max(0, b - span):b])
+                        if ser:
+                            role_l.append((k, ser.end()))
+                    if role_l:
+                        role_l = sorted(role_l, key=lambda x: x[1]) # 解决 400064746000 表格某个为空导致两个表头相近提取错误 申报单位名称:备案机关:海门经济技术开发区管理委员会,备案证号:海开审备〔2024〕346号
+                        k, _ = role_l[-1]
+                        if rs_dic[k] == '':
+                            rs_dic[k] = entity.entity_text
+                        multi_project[k] = entity.entity_text
+                        found_key = 1
+                        flag = 0
                     if flag and entity.entity_type == "org" and re.search('(局|委员会|委|厅)$', entity.entity_text):
                         org_set.add(entity.entity_text)
                 elif entity.entity_type in ['person']:
@@ -7493,15 +7567,22 @@ class ApprovalPredictor():
                             found_key = 1
                             break
                 elif entity.entity_type in ['time']:
+                    time_l = []
                     for k, v in self.date_type.items():
-                        if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
-                            time = timeFormat(entity.entity_text, default_first_day=False) if k in ['time_completion'] else timeFormat(entity.entity_text)
-                            if time == "":
-                                continue
-                            if rs_dic[k] == '':
-                                rs_dic[k] = time
-                            multi_project[k] = time
-                            found_key = 1
+                        ser = re.search(v, sentences[entity.sentence_index][max(0, b - span):b])
+                        if ser:
+                            time_l.append((k, ser.end()))
+                    if time_l:
+                        time_l = sorted(time_l, key=lambda x: x[1])
+                        k, end = time_l[-1]
+                        time = timeFormat(entity.entity_text, default_first_day=False) if k in [
+                            'time_completion'] else timeFormat(entity.entity_text)
+                        if time == "":
+                            continue
+                        if rs_dic[k] == '':
+                            rs_dic[k] = time
+                        multi_project[k] = time
+                        found_key = 1
                 elif entity.entity_type in ['location']:
                     for k, v in self.addr_type.items():
                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
@@ -7535,25 +7616,46 @@ class ApprovalPredictor():
                             rs_dic[k] = entity.entity_text
                         multi_project[k] = entity.entity_text
                         found_key = 1
-            for k, v in self.other_part.items():
-                for iter in re.finditer(v, text):
-                    if rs_dic[k] == '':
-                        rs_dic[k] = iter.group('main')
-                    multi_project[k] = iter.group('main')
+            for k, v in self.other_part.items(): # 规则提取非实体类信息
+                ser = re.search(v, text)
+                if ser:
+                    if rs_dic[k] == '' or (k == 'project_name' and ',审批事项:' in rs_dic[k]): # 修复 54087410 项目名称包含错误
+                        rs_dic[k] = ser.group('main')
+                    multi_project[k] = ser.group('main')
                     found_key = 1
-                    break
-            for k, v in self.date_type.items():
-                for iter in re.finditer(v+':?(?P<main>20\d{2}-\d{1,2}(-\d{1,2})?|20\d{2}/\d{1,2}(/\d{1,2})?|20\d{2}\.\d{1,2}(\.\d{1,2})?|20\d{2}(0[1-9]|1[0-2])(0[1-9]|[1-2][0-9]|3[0-1])?)', text): # 规则补充实体识别不到的日期时间
-                    time = timeFormat(iter.group('main'), default_first_day=False) if k in ['time_completion'] else timeFormat(iter.group('main'))
+            for k, v in self.date_type.items(): # 规则补充时间实体
+                if multi_project[k] != '':
+                    continue
+                ser = re.search(v+':?(?P<main>20\d{2}-\d{1,2}(-\d{1,2})?|20\d{2}/\d{1,2}(/\d{1,2})?|20\d{2}\.\d{1,2}(\.\d{1,2})?|20\d{2}(0[1-9]|1[0-2])(0[1-9]|[1-2][0-9]|3[0-1])?)', text)
+                if ser:# 规则补充实体识别不到的日期时间
+                    time = timeFormat(ser.group('main'), default_first_day=False) if k in ['time_completion'] else timeFormat(ser.group('main'))
                     if time == "":
                         continue
                     if rs_dic[k] == '':
                         rs_dic[k] = time
                     multi_project[k] = time
                     found_key = 1
-                    break
+            for k, v in self.addr_type.items(): # 规则补充地址实体 400063690529 实体不完整 建设地点:湖北省-咸宁市-通城县 通城县大坪乡沙口村15组(通城经济开发区)
+                ser = re.search(v + ':?(?P<main>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])',text)
+                if ser:
+                    if rs_dic[k] == '' or len(rs_dic[k]) < len(ser.group('main')):
+                        rs_dic[k] = ser.group('main')
+                    if len(multi_project[k]) < len(ser.group('main')):
+                        multi_project[k] = ser.group('main')
+                    found_key = 1
+            for k, v in self.role_type.items(): # 规则补充公司实体
+                if multi_project[k] != '':
+                    continue
+                ser = re.search('(%s):(?P<main>[\w()]{6,30}(局|发改|超市|棋牌室|店|(个体工商户)))[,。]'%self.role_type[k], text)
+                if ser:
+                    if rs_dic[k] == '':
+                        rs_dic[k] = ser.group('main')
+                    multi_project[k] = ser.group('main')
+
             if (multi_project['project_code'] != "" or multi_project['project_name'] != "") and multi_project['project_code']+multi_project['project_name'] not in code_name_set:
                 code_name_set.add(multi_project['project_code']+multi_project['project_name'])
+                if len(set([k for k,v in multi_project.items() if v!=''])-set(['project_name', 'project_code']))<2: # 除了包其他要素少于两个的不作为多包
+                    continue
                 district = getPredictor('district').get_area(
                     multi_project['approver'] + multi_project['project_name'] + multi_project['project_addr'], '')
                 if district['district']['province'] != '全国':
@@ -7561,7 +7663,7 @@ class ApprovalPredictor():
                     multi_project['province'] = district['district']['province']
                     multi_project['city'] = district['district']['city']
                     multi_project['district'] = district['district']['district']
-                multi_project = {k:v for k,v in multi_project.items() if v != ''}
+                multi_project = {k: v for k, v in multi_project.items() if v != ''}
                 rs_l.append(multi_project)
         if len(rs_l)>1 and len(set(rs_l[0].keys()))>2 and set(rs_l[0].keys())==set(rs_l[1].keys()):
             return rs_l
@@ -7575,6 +7677,41 @@ class ApprovalPredictor():
                 rs_dic['district'] = district['district']['district']
             if len(org_set) == 1 and rs_dic['approver'] == "":
                 rs_dic['approver'] == org_set.pop()
+
+            n = 0
+            scale_l = [] # 保存以建设规模开头的文本,如果只有一个且比原来长的替换为此文本,避免提取不完成情况
+            for text in texts_list: # 补充纠正内容
+                for k, v in self.other_part.items():
+                    kw = v.split(':')[0]
+                    if re.search('^(%s)$'%kw, text) and rs_dic[k]=='':  # 处理非表格表头内容 排列数据 例:400064764198,web_no: XM0016-5
+                        if n >1  and n+2 < len(texts_list) and get_header_line(texts_list[n-2:n+3]) == [1,0,1,0,1]:
+                            rs_dic[k] = texts_list[n+1]
+                        elif n in [0,1] and n+2 < len(texts_list) and get_header_line(texts_list[n:n+3]) == [1,0,1]:
+                            rs_dic[k] = texts_list[n + 1]
+                        elif n >1  and n+2 == len(texts_list) and get_header_line(texts_list[n-2:n+2]) == [1,0,1,0]:
+                            rs_dic[k] = texts_list[n + 1]
+                        elif k == 'construction_scale' and re.search('^(?[一二三四五六七八九十][)、]', text) and n+1 < len(texts_list): # 大纲 例:53375037
+                            rs_dic[k] = texts_list[n + 1]
+                    if k == 'construction_scale' and len(rs_dic.get(k, '')) < len(text):
+                        ser = re.search('^(%s):(?P<main>.+)'%kw, text)
+                        if ser:
+                            rs_dic[k] = ser.group('main')
+
+                n += 1
+                if 0<len(rs_dic['construction_scale'])<len(text) and rs_dic['construction_scale'][-1] not in [',', '。'] and text.find(rs_dic['construction_scale'])==0:
+                    scale_l.append(text)
+            if len(scale_l)==1 and len(scale_l[0])>len(rs_dic['construction_scale']): # 规则补充不完整规模信息 例:53334434
+                rs_dic['construction_scale'] = scale_l[0]
+            if 0<len(rs_dic['construction_scale'])<8 and re.search('([编代][号码]|名称|时间|日期|金额|单位|机构)$', rs_dic['construction_scale']):
+                rs_dic['construction_scale'] = ''
+
+            for k, v in rs_dic.items(): # 限制最大长度
+                if len(v)>500:
+                    v = v[:500]+'...后面省略%d字'%(len(v)-500)
+                    rs_dic[k] = v
+                if v == 'null':
+                    rs_dic[k] = ''
+
             rs_dic = {k: v for k, v in rs_dic.items() if v != ''}
             return [rs_dic]
         return []
@@ -7995,3 +8132,5 @@ if __name__=="__main__":
     # df['pos'] = df.apply(lambda x: 1 if x['label']==x['rule_label'] else 0, axis=1)
     # # df.to_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果_rule_predict.xlsx', index=False, columns=columns)
     # df.to_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果60000-90000_rule_predict.xlsx', index=False, columns=columns)
+    # print(get_header_line(['环评项目登记号','/','环评批文文号','金环许[2023]126号','环评批文日期']))
+    # print(get_header_line(['序号', '项目名称', '建设地点', '建设单位', '环评机构', '项目概况', '主要环境影响及预防或者减轻不良环境影响的对策和措施', '建设单位或地方政府作出的相关环保承诺', '公众反馈意见的联系方式']))