Ver Fonte

优化存款类项目招标金额;优化反馈问题

lsm há 7 meses atrás
pai
commit
cfe97d88bd

+ 5 - 0
.gitignore

@@ -9,6 +9,7 @@
 /BiddingKG/dl/product/data/
 /BiddingKG/dl/channel/data/
 /BiddingKG/dl_dev/test
+/BiddingKG/dl_dev/test2
 /BiddingKG/dl/test
 node_modules
 /BiddingKG/dl/table_head/train_data/
@@ -16,3 +17,7 @@ node_modules
 /BiddingKG/dl/table_head/checkpoints/
 /BiddingKG/dl/table_head/data_new.csv
 /BiddingKG/dl/table_head/has_table_no_attach.xlsx
+/BiddingKG/dl/LEGAL_ENTERPRISE.txt
+/BiddingKG/dl_dev/
+BiddingKG.iml
+misc.xml

+ 12 - 0
BiddingKG/dl/common/Utils.py

@@ -1134,6 +1134,18 @@ def is_all_winner(title):
         return 2
     return False
 
+def is_deposit_project(title, name, requirement):
+    '''
+    Decide with a single regex whether a project is a bank-deposit type project.
+    :param title: title text
+    :param name: project name
+    :param requirement: procurement content text
+    :return: True if the pattern matches anywhere in title+name+requirement (concatenated), else False
+    '''
+    if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)((.{2,10}))?存放|存放银行|存款(服务|业务|项目)|国库现金管理|存款账户开户|(管理|存款|合作)(定点|专户)?银行|贷款合作银行|资金监管账户|开户银行项目|专户开户银行|银行专户选择|定期存[款放]|专项债券?专用账户', title+name+requirement):
+        return True
+    return False
+
 def recall(y_true, y_pred):
     '''
     计算召回率

+ 2 - 0
BiddingKG/dl/entityLink/entityLink.py

@@ -236,6 +236,8 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
                         _ent = long_entity[second_i]
                         if _ent.label in [0,1,5]:
                             if len(_entity.entity_text)<len(_ent.entity_text) and is_short(_entity.entity_text, _ent.entity_text):  # 简称顺序包含在工商名称内的替换
+                                if _entity.entity_text.endswith('大学'): # 修复 533357339 东北大学 替换为 中国银行沈阳东北大学支行
+                                    continue
                                 _entity.entity_text = _ent.entity_text
                                 lb, prob = bus_dic[_entity.entity_text]
                                 if lb in [0, 1] and prob > 0.9 and _entity.values[

+ 8 - 5
BiddingKG/dl/interface/Preprocessing.py

@@ -2970,6 +2970,9 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('备选中标人', '第二候选人', article_processed)  # 341344142 # 2023/7/17 特殊表达修改
         article_processed = re.sub('例:建设银行(甲方全称)', ' ', article_processed)  # 2024/06/12 特殊表达修改 修改 481513912 金采网 附件模板导致错误提取招标人
         article_processed = re.sub('^[,,.。;;、]+', '', article_processed)
+        article_processed = re.sub('资,金', '资金', article_processed)
+        article_processed = re.sub('金,额', '金额', article_processed)
+        article_processed = re.sub('存,款', '存款', article_processed)
         if web_source_no.startswith('DX002756-'):
             article_processed = re.sub('状态:(进行中|已结束)单位', ',项目单位', article_processed)  # 376225646
         if web_source_no.startswith('DX006116-') and re.search('结果公告如下:.{5,50},单位名称:', article_processed):  # 2023/11/20 特殊处理 381591924 381592533 这种提取不到情况
@@ -3361,16 +3364,16 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
 
             entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
             # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
-            if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算|投资额',
-                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
+            if re.search('总投资|投资总额|总预算|总概算|(投资|招标|资金|存放|操作|融资)规模|批复概算|投资额|总规模|工程造价|总金额',
+                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额  20241031工程造价作总投资
                 # print('总投资金额: ', _match.group(0))
                 notes = '总投资'
             elif re.search('投资|概算|建安费|其他费用|基本预备费',
                            sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
                 notes = '投资'
-            elif re.search('工程造价',
-                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
-                notes = '工程造价'
+            # elif re.search('工程造价',
+            #                sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
+            #     notes = '工程造价'
             elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
                   or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\((]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\))]*[::为]',
                                sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])

+ 85 - 14
BiddingKG/dl/interface/extract.py

@@ -27,7 +27,7 @@ import BiddingKG.dl.complaint.punish_predictor as punish_rule
 import json
 from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
-from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list
+from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list, extract_addr
 from BiddingKG.dl.interface.get_label_dic import get_all_label
 
 
@@ -267,10 +267,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''大纲提取及大纲内容相关提取'''
     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
-    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines= extract_parameters(parse_document, list_articles[0].content)
+    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope = extract_parameters(parse_document)
     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
-        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines = extract_parameters(parse_document, list_articles[0].content)
+        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope = extract_parameters(parse_document)
+    if addr_bidopen_text == '':
+        addr_bidopen_text = extract_addr(list_articles[0].content)
 
     # 过滤掉Redis里值为0的错误实体
     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
@@ -306,8 +308,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     log("get product attributes done of doc_id%s"%(doc_id))
     cost_time["product_attrs"] = round(time.time()-start_time,2)
 
+    # 是否为存款类项目
+    deposit_project = is_deposit_project(title, codeName[0]['name'], requirement_text)
+
     start_time = time.time() #正则角色提取
-    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName, channel_dic, all_winner=is_all_winner(title))
+    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName, channel_dic, all_winner=is_all_winner(title), req_scope=requirement_scope, deposit_project=deposit_project)
     cost_time["rule"] = round(time.time()-start_time,2)
 
     '''正则补充最后一句实体日期格式为招标或代理 2021/12/30;正则最后补充角色及去掉包含 公共资源交易中心 的招标人'''
@@ -391,7 +396,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
         prem[0]['prem'] = {}  # 审批项目不要这项
 
     else:
-        channel_dic, msc = predictor.getPredictor("channel").final_change(channel_dic, prem[0], title, text, original_docchannel, msc)
+        channel_dic, msc = predictor.getPredictor("channel").final_change(channel_dic, prem[0], original_docchannel, msc)
     # print('msc', msc)
     cost_time["rule_channel2"] = round(time.time()-start_time,2)
 
@@ -450,7 +455,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-10-17'}
+    version_date = {'version_date': '2024-11-07'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
@@ -517,6 +522,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # 限制产品数量
     data_res['product'] = data_res['product'][:500]
     data_res['product_attrs']['data'] = data_res['product_attrs']['data'][:500]
+    # 是否为存款项目
+    data_res['is_deposit_project'] = deposit_project
 
     # for _article in list_articles:
     #         log(_article.content)
@@ -585,14 +592,78 @@ if __name__=="__main__":
     #     print(rs['product_attrs'])
     # print(rs)
 
-    with open('2.html', 'r', encoding='utf-8') as f:
-        text = f.read()
-        t1 = time.time()
-        print(predict('', text, title))
-        t2 = time.time()
-        print(predict('', text, title))
-        t3 = time.time()
-        print('第一次耗时:%.4f, 第二次耗时:%.4f'%(t2-t1, t3-t2))
+    # df2 = pd.read_csv('E:/导出数据/存款入围框架采购等公告_输入要素.csv')
+    # df = pd.read_csv('E:\导出数据/存款入围框架采购等公告_预测结果0830.csv')
+    # df1 = pd.read_csv('E:\导出数据/存款入围框架采购等公告_html.csv')
+    # df = df.merge(df1, on='docid', how='left')
+    # print(len(df), df.columns)
+    # df['rs'] = df['extract_json1'].apply(lambda x: json.loads(x))
+    # docids = []
+    # n1 = n2 = 0
+    # for docid, d, html in zip(df['docid'], df['rs'], df['dochtmlcon']):
+    #     if d['docchannel']['docchannel'] == '招标公告' and '元' in html and 'Project' in d['prem'] and float(
+    #             d['prem']['Project']['tendereeMoney']) == 0:
+    #         docids.append(docid)
+    #         n1 += 1
+    #     else:
+    #         n2 += 1
+    # print(n1, n2)
+    # df = df[df['docid'].isin(docids)]
+    #
+    # df = df.merge(df2, on='docid', how='left')
+    # df.fillna('', inplace=True)
+    # df = df[['docid', 'doctitle', 'page_time', 'web_source_no', 'web_source_name', 'original_docchannel', 'dochtmlcon']]
+    # print(df.columns)
+    # # df = df[:10]
+    # print(len(df))
+    #
+    # l = []
+    # for docid, text, title, page_time,web_no, web_name, channel in zip(df['docid'], df['dochtmlcon'], df['doctitle'], df['page_time'],
+    #                                                                    df['web_source_no'], df['web_source_name'], df['original_docchannel']):
+    #     rs, content, roles = predict('', text, title, page_time, web_no, web_name, channel)
+    #     l.append((docid, rs, content, roles))
+    # df = pd.DataFrame(l, columns=['docid', 'rs', 'content', 'roles'])
+    # df.to_csv('E:\导出数据/存款入围框架采购等公告_招标公告无招标金额预测结果.csv')
+
+    df = pd.read_csv('E:\角色金额数据/银行类招标金额缺失公告_输入要素.csv')
+    # df2 = pd.read_csv('E:/角色金额数据/银行缺招标金额公告_3_html.csv')
+    df2 = pd.read_csv('E:/角色金额数据/银行缺招标金额公告_012_html.csv')
+    print(len(df), len(df2))
+    df = df[df['docid'].isin(df2['docid'])]
+    print(len(df))
+    df = df.merge(df2, how='left', on='docid')
+    print(len(df))
+
+    # df1 = pd.read_excel('E:\角色金额数据/银行缺招标金额公告_检查汇总2.xlsx')
+    # # df1 = pd.read_excel('E:\角色金额数据/银行缺招标金额公告_检查汇总2_补充招标内容金额后1105.xlsx')
+    # df1.fillna('', inplace=True)
+    # df1 = df1[df1['tendereeMoney']==0]
+    # df = df.merge(df1, on='docid', how='right')
+
+    df = df[['docid', 'doctitle', 'page_time', 'web_source_no', 'web_source_name', 'original_docchannel', 'dochtmlcon']]
+    print(df.columns)
+    # df = df[:10]
+    print(len(df))
+
+    l = []
+    for docid, text, title, page_time,web_no, web_name, channel in zip(df['docid'], df['dochtmlcon'], df['doctitle'], df['page_time'],
+                                                                       df['web_source_no'], df['web_source_name'], df['original_docchannel']):
+        rs, content, roles = predict('', text, title, page_time, web_no, web_name, channel)
+        l.append((docid, rs, content, roles))
+    df = pd.DataFrame(l, columns=['docid', 'rs', 'content', 'roles'])
+    # df.to_csv('E:\角色金额数据/银行缺招标金额公告_3预测结果.csv')
+    # df.to_csv('E:\角色金额数据/银行缺招标金额公告_012预测结果.csv')
+    # df.to_csv('E:\角色金额数据/银行缺招标金额公告_检查汇总2_修复后预测结果.csv')
+    df.to_csv('E:\角色金额数据/银行缺招标金额公告_检查汇总2_修复后预测结果1106.csv')
+
+    # with open('2.html', 'r', encoding='utf-8') as f:
+    #     text = f.read()
+    #     t1 = time.time()
+    #     print(predict('', text, title))
+    #     t2 = time.time()
+    #     print(predict('', text, title))
+    #     t3 = time.time()
+    #     print('第一次耗时:%.4f, 第二次耗时:%.4f'%(t2-t1, t3-t2))
     # print(predict('',text,title))
 
     # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]

+ 1 - 1
BiddingKG/dl/interface/getAttributes.py

@@ -4449,7 +4449,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, a
 
     moneys = []
     moneys_attachment = []
-    if channel_dic['docchannel']['docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
+    if channel_dic['docchannel']['life_docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
         sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
         finalists = [] # 入围供应商
         multi_winner_l = [] # 保存中标人名称列表

+ 11 - 10
BiddingKG/dl/interface/htmlparser.py

@@ -534,16 +534,17 @@ class ParseDocument():
                                 has_product = True
                                 break
 
-            if _type=="sentence":
-                if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
-                    list_data[-1]["text"] += _text
-                    list_data[-1]["line_width"] = len(_text)
-                    _append = True
-                elif sentence_title is None and len(list_data)>0 and _type==list_data[-1]["type"]:
-                    if list_data[-1]["line_width"]>=max_length*0.7:
-                        list_data[-1]["text"] += _text
-                        list_data[-1]["line_width"] = len(_text)
-                        _append = True
+            # 合并两个非标题句子 20241106 注销,由于 485441521 招标内容结束位置不对
+            # if _type=="sentence":
+            #     if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
+            #         list_data[-1]["text"] += _text
+            #         list_data[-1]["line_width"] = len(_text)
+            #         _append = True
+            #     elif sentence_title is None and len(list_data)>0 and _type==list_data[-1]["type"]:
+            #         if list_data[-1]["line_width"]>=max_length*0.7:
+            #             list_data[-1]["text"] += _text
+            #             list_data[-1]["line_width"] = len(_text)
+            #             _append = True
 
             if _type=="table":
                 _soup = BeautifulSoup(_text,"lxml")

+ 24 - 13
BiddingKG/dl/interface/outline_extractor.py

@@ -52,7 +52,6 @@ def extract_sentence_list(sentence_list):
                 new_sentence2_list_attach.append(sentence2)
             else:
                 new_sentence2_list.append(sentence2)
-
     return new_sentence2_list, new_sentence2_list_attach
 
 requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
@@ -63,18 +62,18 @@ addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|
 addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
 out_lines = []
 
-def extract_parameters(parse_document, content):
+def extract_parameters(parse_document):
     '''
     通过大纲、预处理后文本正则获取需要字段
     :param parse_document: ParseDocument() 方法返回结果
-    :param content: 公告预处理后文本
     :return:
     '''
     list_data = parse_document.tree
-    requirement_text = ''
-    aptitude_text = ''
-    addr_bidopen_text = ''
-    addr_bidsend_text = ''
+    requirement_text = '' # 采购内容
+    aptitude_text = '' # 资质要求
+    addr_bidopen_text = '' # 开标地址
+    addr_bidsend_text = '' # 投标地址
+    requirement_scope = [] # 采购内容始末位置
 
     _find_count = 0
     _data_i = -1
@@ -86,14 +85,18 @@ def extract_parameters(parse_document, content):
         # print(_data.keys())
         if _type=="sentence":
             if _data["sentence_title"] is not None:
-                if re.search('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', _text[:10]):
+                if re.search('[((][一二三四五六七八九十}]+[))]|[一二三四五六七八九十]+\s*、|^\d{1,2}[.、][\u4e00-\u9fa5]', _text[:10]):
                     out_lines.append((_text, _data['sentence_index'], _data['wordOffset_begin']))
 
                 if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求,', _text[:30])==None:
+                    b = (_data['sentence_index'], _data['wordOffset_begin'])
                     childs = get_childs([_data])
                     for c in childs:
                         # requirement_text += c["text"]+"\n"
                         requirement_text += c["text"]
+                    e = (c['sentence_index'], c["wordOffset_end"]) if len(childs)>0 else (_data['sentence_index'], _data['wordOffset_end'])
+                    requirement_scope.append(b)
+                    requirement_scope.append(e)
                     _data_i += len(childs)
                     _data_i -= 1
     _data_i = -1
@@ -161,15 +164,23 @@ def extract_parameters(parse_document, content):
         addr_bidopen_text = addr_bidopen_text[b:e]
     elif re.search('开启', addr_bidopen_text) and re.search('时间:\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
         addr_bidopen_text = ""
-    if addr_bidopen_text == "":
-        ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件))?(会议)?地[点址]([((]网址[))])?[:为][^,;。]{2,100}[,;。]', content)
-        if ser:
-            addr_bidopen_text = ser.group(0)
     if re.search('时间:', addr_bidsend_text) and re.search('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text):
         for ser in re.finditer('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text):
             b, e = ser.span()
         addr_bidsend_text = addr_bidsend_text[b:e]
-    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines
+    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope
+
+def extract_addr(content):
+    '''
+    Extract an address clause (地点/地址 ...) from the text with a single regex search.
+    :param content: preprocessed announcement text
+    :return: the first matched span, or '' when nothing matches
+    '''
+    addr_bidopen_text = ''
+    ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件))?(会议)?地[点址]([((]网址[))])?[:为][^,;。]{2,100}[,;。]', content)  # NOTE(review): as displayed here the pattern has one more ')' than '(' — confirm against the repository file
+    if ser:
+        addr_bidopen_text = ser.group(0)
+    return addr_bidopen_text
 
 if __name__ == "__main__":
     # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:

+ 118 - 49
BiddingKG/dl/interface/predictor.py

@@ -872,7 +872,7 @@ class PREMPredict():
                 elif re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', front)==None:
                     values[2] = 0.5
                     label = 5
-                elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$|乙方接受为$', front):  # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模
+                elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$|乙方接受为$|丙方:$', front):  # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模
                     label = 5
                 elif re.search(',来源:$', front) and re.search('^,', behind): # 修复 472062585 项目采购-关于定制手机询比价采购中标公告,来源:深圳市网联安瑞网络科技有限公司 预测为中标
                     label = 0
@@ -985,12 +985,15 @@ class PREMPredict():
                 elif entity.notes == '单价' and float(entity.entity_text)<5000:
                     label = 2
             elif label ==0: # 错误招标金额处理
-                if entity.notes in ["投资", "总投资","工程造价"] or re.search('最低限价:?$|注册资本', front) or re.search('服务内容:([\d,.]+万?亿?元?-?)$', front):
+                if re.search('投资(金额|规模):$', front): # 545988699 金额不大的投资金额作为备选招标金额
+                    values[label] = 0.51
+                elif entity.notes in ["投资", "总投资","工程造价"] or re.search('最低限价:?$|注册资本', front) or re.search('服务内容:([\d,.]+万?亿?元?-?)$', front):
                     values[label] = 0.49
+                    label = 2
                 elif re.search('^(以[上下])?按[\d.%]+收取|^及?以[上下]|^[()]?[+×*-][\d.%]+|(含)', behind):
                     values[label] = 0.49
-                elif re.search('(含|在|包括|[大小等高低]于|如预算金额为)$|[\d.%]+((含))?[+×*-]$', front):
-                    values[label] = 0.49
+                # elif re.search('(含|在|包括|[大小等高低]于|如预算金额为)$|[\d.%]+((含))?[+×*-]$', front):  # 2024/10/30 注销,避免漏提 预算金额:控制在26000元以内由合作银行出资 ;投资金额不低于人民币500万元
+                #     values[label] = 0.49
                 elif entity.notes == '单价' and float(entity.entity_text)<5000:
                     label = 2
             elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
@@ -1423,11 +1426,11 @@ class RoleRulePredictor():
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
         self.pattern_winTenderer_left_50 = "(?P<winTenderer_left_51>" \
-               "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?|银行)(:?单位名称|:?名称|盖章)?[::是为]+$" \
+               "(乙|竞得|受让|买受|签约|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?|银行)(:?单位名称|:?名称|盖章)?[::是为]+$" \
                "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \
                "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$|竞争性选择申请人名称:$" \
                "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格),$" \
-               "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$)"  # 承办单位:不作为中标 83914772
+               "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$)"  # 承办单位:不作为中标 83914772  |施工 单位不作为中标人 例:386692187
         self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
                                            "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
                                            "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$|选取(情况|说明):中选,中介机构名称:$|排名如下:1、$|第[一1]名,?投标(人|单位|银行|公司):$)"  # 解决表头识别不到加逗号情况,需前面为,。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
@@ -1450,7 +1453,7 @@ class RoleRulePredictor():
         self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
         self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
 
-        self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人)?[::是为]+$)"
+        self.condadate_left = "(?P<candidate_left>(((中[选商]|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人|[及与和](成交|中标)金额)?[::是为]+$)"
 
         self.pattern_left = [
             self.pattern_tenderee_left_60,
@@ -1479,10 +1482,12 @@ class RoleRulePredictor():
 
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
-        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?[为:]+\w{2,4}资金|采购成本价|总费用约?为|招标规模")  # |建安费用 不作为招标金额
+        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?[为:]+\w{2,4}资金|采购成本价|总费用约?为|(招标|采购)总?(规模|额度|资金)|资金来源")  # |建安费用 不作为招标金额
         self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):|经评审的价格")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元(报价)?(中标|中选|成交)")
         self.pattern_money_other = re.compile("代理费|服务费")
+        self.pattern_money_bank_tenderee = "存[款放](操作)?,?总?(金额|总额|规模|额度|资金)|招标的?资金总量|(项目|资金)总?(规模|额度)|现金管理的?(操作)?(额度|规模|总额)|定期存款|存款大?约|定期存储|竞争性存放|项目资金|日均存款|资金现状|存量金额|招标分配的资金量|资金总[量额]|总(规模|额度|金额)|投资金额" # 存款类招标金额
+        self.pattern_money_bank_tenderee_right = "^,?(提供定期存放服务|存[款放](期限|时间)|存期|结构性存款|期限|\w{,4}(定期存款|公款存放|资金存放))" # 存款类招标金额
         self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
         # self.role_file = open('/data/python/lsm/role_rule_predict.txt', 'a', encoding='utf-8')
 
@@ -1561,7 +1566,20 @@ class RoleRulePredictor():
         return (_label, _prob, _flag, keyword)
 
 
-    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, channel_dic, on_value=0.5, all_winner=False):
+    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, channel_dic, on_value=0.5, all_winner=False, req_scope=[], deposit_project=False):
+        '''
+
+        :param list_articles:
+        :param list_sentences:
+        :param list_entitys:
+        :param list_codenames:
+        :param channel_dic:
+        :param on_value: 最低阈值
+        :param all_winner: 是否存款、入围等公告,不分排名作为中标人
+        :param req_scope: 大纲采购内容开始结束位置[(开头句子index, 开头位置), (结束句子index, 结束句子位置)]
+        :param deposit_project: 是否为银行存款类项目
+        :return:
+        '''
 
         for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
                                                                       list_codenames):
@@ -1571,6 +1589,7 @@ class RoleRulePredictor():
             name_entitys = [] # 2023/6/30 保存项目名称实体,直接通过位置判断角色是否在项目名称里面
             candidates = [] # 保存不能确定为第几的候选人 2023/04/14
             notfound_tenderer = True  # 未找到前三候选人
+            deposit_moneys = []  # 保存存款类项目采购内容中大于百万的其他金额实体
             for entity in list_entity:
                 if entity.entity_type == 'name':
                     list_name.append(entity.entity_text)
@@ -1587,10 +1606,12 @@ class RoleRulePredictor():
                         find_flag = False
                         for _sentence in list_sentence:
                             if _sentence.sentence_index == p_entity.sentence_index:
-                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
-                                                   end_index=p_entity.end_index, size=20, center_include=True,
-                                                   word_flag=True, use_text=True,
-                                                   text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
+                                # _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
+                                #                    end_index=p_entity.end_index, size=20, center_include=True,
+                                #                    word_flag=True, use_text=True,
+                                #                    text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
+                                _span = get_context(_sentence.sentence_text, p_entity.wordOffset_begin,
+                                                    p_entity.wordOffset_end, size=20, center_include=True)
                                 if re.search(self.pattern_tenderee_left_50, _span[0]) or re.search(self.pattern_tenderee_left_55, _span[0]): # 前面有关键词的实体不判断是否在项目名称中出现
                                     find_flag = True
                                     break
@@ -1636,9 +1657,11 @@ class RoleRulePredictor():
                             if str(_name).find(p_entity.entity_text) >= 0 and p_entity.sentence_index < 4:
                                 for _sentence in list_sentence:
                                     if _sentence.sentence_index == p_entity.sentence_index:
-                                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
-                                                           end_index=p_entity.end_index, size=20, center_include=True,
-                                                           word_flag=True, use_text=True, text=p_entity.entity_text)
+                                        # _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
+                                        #                    end_index=p_entity.end_index, size=20, center_include=True,
+                                        #                    word_flag=True, use_text=True, text=p_entity.entity_text)
+                                        _span = get_context(_sentence.sentence_text, p_entity.wordOffset_begin,
+                                                            p_entity.wordOffset_end, size=20, center_include=True)
                                         if _span[2].startswith(":"): # 实体后面为冒号的不作为招标人,避免项目名称出错中标变招标  368122675 陇西兴恒建建筑有限责任公司:线路安全保护区内环境治理专项整改(第二标段)项目
                                             break
                                         if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
@@ -1798,37 +1821,74 @@ class RoleRulePredictor():
                     if str(p_entity.label) == "2":
                         for _sentence in list_sentence:
                             if _sentence.sentence_index == p_entity.sentence_index:
-                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
-                                                   end_index=p_entity.end_index, size=10, center_include=True,
-                                                   word_flag=True, text=p_entity.entity_text)
+                                # _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
+                                #                    end_index=p_entity.end_index, size=10, center_include=True,
+                                #                    word_flag=True, text=p_entity.entity_text)
+                                _span = get_context(_sentence.sentence_text, p_entity.wordOffset_begin, p_entity.wordOffset_end, size=20, center_include=True) # 20241101 修复spanWindow方法取词错误, ['金额(万元:', '27000,存', '期:3个月,四、投标人资格:1.在嘉兴']
                                 if re.search('(含|在|包括)(\d+)?$', _span[0]):
                                     continue
-                                if re.search(',\w{2,}', _span[0]):
-                                    _span[0] = _span[0].split(',')[-1] if len(_span[0].split(',')[-1])>4 else _span[0][-8:] #避免多个价格在一起造成误判
                                 if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
                                         self.pattern_money_other, _span[0]) is None:
-                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
+                                    front_text = _span[0][re.search(self.pattern_money_tenderee, _span[0]).end():]
+                                    if re.search('\d[万亿]?元|元)?:?\d', front_text): # 当前金额与关键词中间有金额的过滤掉
+                                        break
+                                    p_entity.values[0] = 0.62 + p_entity.values[0] / 10
                                     p_entity.label = 0
-                                    # print('规则召回预算金额:', p_entity.entity_text, _span[0])
+                                elif deposit_project:
+                                    if re.search(self.pattern_money_bank_tenderee,
+                                                   _span[0]) is not None and re.search(
+                                        self.pattern_money_other, _span[0]) is None:
+                                        front_text = _span[0][re.search(self.pattern_money_bank_tenderee, _span[0]).end():]
+                                        if re.search('\d[万亿]?元|元)?:?\d', front_text):  # 当前金额与关键词中间有金额的过滤掉
+                                            break
+                                        p_entity.values[0] = 0.6 + p_entity.values[0] / 10
+                                        p_entity.label = 0
+                                    elif re.search(self.pattern_money_bank_tenderee_right, _span[2]):
+                                        p_entity.values[0] = 0.55 + p_entity.values[0] / 10
+                                        p_entity.label = 0
+                                    elif (re.search('存款|总额度', _span[0]) or re.search('存[款放]|专项债资金', _span[2])):
+                                        front_text = _span[0][(re.search('存款|总额度', _span[0]) or re.search('存[款放]|专项债资金', _span[2])).end():]
+                                        if re.search('\d[万亿]?元|元)?:?\d', front_text):  # 当前金额与关键词中间有金额的过滤掉
+                                            break
+                                        p_entity.values[0] = 0.55
+                                        p_entity.label = 0
+                                        # print('规则召回预算金额 4:', p_entity.entity_text, _span[0],p_entity.values[0])
                                 if re.search(self.pattern_money_tenderer, _span[0]) is not None:
+                                    front_text = _span[0][re.search(self.pattern_money_tenderer, _span[0]).end():]
+                                    if re.search('\d[万亿]?元|元)?:?\d', front_text):  # 当前金额与关键词中间有金额的过滤掉
+                                        break
                                     if re.search(self.pattern_money_other, _span[0]) is not None:
                                         if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
                                                 re.search(self.pattern_money_other, _span[0]).span()[1]:
-                                            p_entity.values[1] = 0.8 + p_entity.values[1] / 10
+                                            p_entity.values[1] = 0.6 + p_entity.values[1] / 10
                                             p_entity.label = 1
                                     else:
-                                        p_entity.values[1] = 0.8 + p_entity.values[1] / 10
+                                        p_entity.values[1] = 0.6 + p_entity.values[1] / 10
                                         p_entity.label = 1
-                                if re.search(self.pattern_money_tenderer_whole,
-                                             "".join(_span)) is not None and re.search(self.pattern_money_other,
-                                                                                       _span[0]) is None:
-                                    p_entity.values[1] = 0.8 + p_entity.values[1] / 10
+                                if re.search(self.pattern_money_tenderer_whole,"".join(_span)) and re.search(self.pattern_money_tenderer_whole, _span[0])==None \
+                                        and  re.search(self.pattern_money_tenderer_whole, _span[2])==None and re.search(self.pattern_money_other,_span[0])==None:
+                                    p_entity.values[1] = 0.6 + p_entity.values[1] / 10
                                     p_entity.label = 1
                                 elif re.search('(预算金额|最高(投标)?上?限[价额]?格?|招标控制价))?:?([\d.,]+万?元[,(]其中)?(第?[一二三四五0-9](标[段|包]|[分子]包):?[\d.,]+万?元,)*第?[一二三四五0-9](标[段|包]|[分子]包):?$'
                                         , _sentence.sentence_text[:p_entity.wordOffset_begin]): # 处理几个标段金额相邻情况 例子:191705231
-                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
+                                    p_entity.values[0] = 0.6 + p_entity.values[0] / 10
                                     p_entity.label = 0
-                                    # print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
+                    if deposit_project and p_entity.label in [1,2]:
+                        if req_scope and float(p_entity.entity_text)>1000000 and (p_entity.sentence_index>req_scope[0][0]\
+                            or (p_entity.sentence_index==req_scope[0][0] and p_entity.wordOffset_begin>req_scope[0][1])) and (p_entity.sentence_index<req_scope[1][0]\
+                            or (p_entity.sentence_index==req_scope[1][0] and p_entity.wordOffset_end<=req_scope[1][1])):
+                            deposit_moneys.append(p_entity)
+
+            if deposit_moneys:
+                moneys = [float(p.entity_text) for p in deposit_moneys]
+                for p in deposit_moneys:
+                    if float(p.entity_text)==max(moneys):
+                        p.values[0] = 0.55
+                        p.label = 0
+                    else:
+                        p.values[0] = 0.5
+                        p.label = 0
+
             if notfound_tenderer and len(set([ent.entity_text for ent in candidates])) == 1 and channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告']:
                 for p_entity in candidates:
                     # print('只有一个候选人的作为中标人', p_entity.entity_text)
@@ -1841,14 +1901,16 @@ class RoleRulePredictor():
             for p_entity in list_entity:
                 for _sentence in list_sentence:
                     if _sentence.sentence_index == p_entity.sentence_index:
-                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
-                                           end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
-                                           text=p_entity.entity_text)
-
+                        # _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
+                        #                    end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
+                        #                    text=p_entity.entity_text)
+                        _span = get_context(_sentence.sentence_text, p_entity.wordOffset_begin, p_entity.wordOffset_end,
+                                            size=30, center_include=True)
                         if state == 2:
                             for _p in list_p[1:]:
-                                _p.values[0] = 0.8 + _p.values[0] / 10
-                                _p.label = 0
+                                if _p.label == 2:
+                                    _p.values[0] = 0.5 + _p.values[0] / 10
+                                    _p.label = 0
                             state = 0
                             list_p = []
 
@@ -1873,8 +1935,9 @@ class RoleRulePredictor():
             if len(list_p) > 1:
                 for _p in list_p[1:]:
                     # print("==",_p.entity_text,_p.sentence_index,_p.label)
-                    _p.values[0] = 0.8 + _p.values[0] / 10
-                    _p.label = 0
+                    if _p.label == 2:
+                        _p.values[0] = 0.5 + _p.values[0] / 10
+                        _p.label = 0
                 state = 0
                 list_p = []
 
@@ -4010,7 +4073,7 @@ class DocChannel():
           '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
           '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
           '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
-          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|竞价采购|(设备|服务)采购|网上超市采购|定点采购',
+          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|工程|拦标价|控制价',
           # |竞价 采招/产权都有竞价方式 # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
           '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)|行政审批结果'
       }
@@ -4018,7 +4081,7 @@ class DocChannel():
           '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
           '采购意向neg': '发布政府采购意向|采购意向公告已于',
           '招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
-          '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|[^\w]成交规则|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)|评选方式:?\s*价格最低',
+          '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|[^\w]成交规则|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格(要求|条件)|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)|评选方式:?\s*价格最低',
           '资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格
           '招标答疑': '现澄清(为|如下)|答疑补遗|澄清内容如下|第[0-9一二三四五]次澄清|答疑澄清|(最高(投标)?限价|控制价|拦标价)公示',  # |异议的回复
           '公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
@@ -4032,8 +4095,8 @@ class DocChannel():
       # |确定成交供应商[:,\s]
           '合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|完成(日期|时间))|(供应商乙方|乙方供应商):|合同总?金额|履约信息',
           '废标公告': '(终止|中止|废标|流标|流采|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标|废置)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
-          '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?(原因|理由)|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人|人数|供应商|单位)(不足|未达\w{,3}数量)|已终止|不足[3三]家|无(废标)|成交情况:\s*[流废]标|现予以废置',
-          '废标公告neg': '超过此报价将作为[废流]标处理|否则按[废流]标处理|终止规则:|成交规则:|视为流标|竞价失败的一切其他情形|是否废标:否|若不足三家公司参与|供应商数量:?\s*报名供应商不足三家|有效报价不足三家,\s*系统自动废标' # 503076535 供应商数量: 报名供应商不足三家。
+          '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?(原因|理由)|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人|人数|供应商|单位)(不足|未达\w{,3}数量)|已终止|不足[3三]家|无(废标)|成交情况:\s*[流废]标|现予以废置|报名未够三家',
+          '废标公告neg': '超过此报价将作为[废流]标处理|否则按[废流]标处理|终止规则:|成交规则:|视为流标|竞价失败的一切其他情形|是否废标:否|若不足三家公司参与|供应商数量:?\s*报名供应商不足三家|有效报价不足三家,\s*系统自动废标|如遇项目流[标采]' # 503076535 供应商数量: 报名供应商不足三家。
       }
       self.title_life_dic = {
           '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示|意向公开',
@@ -4043,11 +4106,11 @@ class DocChannel():
           '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍|停止)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
           '合同公告': '(合同(成交|变更)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$', # |(履约|验收)(结果)?
           '候选人公示': '候选人(变更)?公示|评标(结果)?(公[告示]|报告)|评审结果', #中标前公示|中标预公示|
-          '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$|项目中标', # |开标(记录|信息|情况)
+          '中标信息': '(中标|中选|中价|中租|成交)?|入选|确认)(候选人|人|供应商|记录|结果|变更|情况)?的?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|磋商|交易|出让|抽取|抽签)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$|项目中标|(项目|工程|服务|定点)的?结果公[告示]|超市直购订单', # |开标(记录|信息|情况)
           '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
           '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
           '开标记录': '开标记录|截标信息|评委名单公示|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果|开标会',
-          '验收合同': '(验收|履约)(公告|公示)|(验收|履约)(结果|报告|意见|单)(公告|公示)'
+          '验收合同': '(验收|履约)(公告|公示)|(验收|履约)(结果|报告|意见|单)(公告|公示)|预留项目执行情况'
       }
 
   def load_life(self,life_model,config):
@@ -4565,6 +4628,9 @@ class DocChannel():
 
       text = html2text(html)
 
+      self.title = title
+      self.text = text
+
       result = {'docchannel': {'docchannel': '', 'doctype': ''}}
 
       doc_type, type_kw = get_type(title, text)
@@ -4602,7 +4668,7 @@ class DocChannel():
       # print('channel ', msc)
       return result, msc
 
-  def final_change(self, result, prem, title, text, original_docchannel, msc):
+  def final_change(self, result, prem, original_docchannel, msc):
       '''
 
       :param result: channel 结果字典
@@ -4635,6 +4701,8 @@ class DocChannel():
               return False
 
       origin_dic = self.origin_dic
+      title = self.title
+      text = self.text
       prem_json = json.dumps(prem, ensure_ascii=False)
       if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
               original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(
@@ -4650,8 +4718,8 @@ class DocChannel():
           original_docchannel, '') in ['招标公告', '采购意向', '招标预告']:
           result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
           msc += '最终规则修改:答疑公告标题无答疑关键且原始为招标,返回原始类别;'
-      elif result['docchannel']['docchannel'] == '招标公告' and is_contain_winner(prem_json) and origin_dic.get(
-              original_docchannel, '') == '中标信息':
+      elif result['docchannel']['docchannel'] == '招标公告' and is_contain_winner(prem_json) and (origin_dic.get(
+              original_docchannel, '') == '中标信息' or re.search('直接采购', title)): # 20241025补充 标题包含直接采购且有中标人的为中标公告
           result['docchannel']['docchannel'] = '中标信息'
           msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
       elif result['docchannel']['docchannel'] in ['招标公告'] and origin_dic.get(
@@ -6028,11 +6096,12 @@ class DistrictPredictor():
             return province_l, city_l, district_l
 
         def get_pro_city_dis_score(text, text_weight=1):
-            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河', ' ', text)
+            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)', ' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
             text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
             text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域:怒江州 识别为广西 - 崇左 - 江州
             text = re.sub('茂名滨海新区', '茂名市', text)
             text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
+            text = re.sub('横州市', '横县', text) # 例:547363890 修复广西南宁横州 不在地区表问题
             ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
             if ser and '黎族' not in ser.group(0):
                 text = text.replace(ser.group(0), ser.group(0)+'黎族')