Kaynağa Gözat

新增联合体提取;限制某些类别最高金额;去除公共资源中心为招标人;优化标段规则;优化角色规则;优化channel分类

lsm 2 yıl önce
ebeveyn
işleme
ba691b8c1d

+ 3 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -1969,7 +1969,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
         article_processed = re.sub('任务(?=编号[::])', '项目',article_processed)  # 2022/08/10 修正为项目编号
         article_processed = article_processed.replace('招标(建设)单位', '招标单位')  #2022/8/10 修正预测不到表达
-        article_processed = re.sub('(招标|采购)人(概况|信息)[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
+        article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
         # 修复OCR金额中“,”、“。”识别错误
         article_processed_list = article_processed.split("##attachment##")
         if len(article_processed_list)>1:
@@ -2479,10 +2479,10 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
                     entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
                     # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
-                    if re.search('总投资|投资总额|总预算|总概算|投资规模', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
+                    if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
                         # print('总投资金额: ', _match.group(0))
                         notes = '总投资'
-                    elif re.search('投资', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
+                    elif re.search('投资|概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
                         notes = '投资'
                     elif re.search('工程造价', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
                         notes = '工程造价'

+ 20 - 1
BiddingKG/dl/interface/extract.py

@@ -148,7 +148,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
     cost_time["rule"] = round(time.time()-start_time,2)
 
-    '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
+    '''正则补充最后一句实体日期格式为招标或代理 2021/12/30;正则最后补充角色及去掉包含 公共资源交易中心 的招标人'''
     start_time = time.time() #正则角色提取
     predictor.getPredictor("roleRuleFinal").predict(list_articles,list_sentences,list_entitys, codeName)
     cost_time["roleRuleFinal"] = round(time.time()-start_time,2)
@@ -189,6 +189,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 
+    '''获取联合体信息'''
+    getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)
+
     start_time = time.time() #失信数据要素提取
     list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
     cost_time["punish"] = round(time.time()-start_time,2)
@@ -202,6 +205,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     # content = list_articles[0].content
     # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
     channel_dic, msc = predictor.getPredictor("channel").predict_merge(title,list_sentences[0], text, list_articles[0].bidway, prem[0], original_docchannel)
+    # print('msc', msc)
     cost_time["rule_channel"] = round(time.time()-start_time,2)
 
     start_time = time.time() # 产品名称及废标原因提取  #依赖 docchannel结果
@@ -219,6 +223,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     '''行业分类提取,需要用标题、项目名称、产品、及prem 里面的角色'''
     industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem)
 
+    '''限制行业最高金额'''
+    getAttributes.limit_maximum_amount(prem, industry)
+
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
@@ -255,6 +262,18 @@ def test(name,content):
     # print(resp_json)
     return resp_json
 
+def get_ent_context(list_sentences, list_entitys):
+    '''
+    Build a debug listing of org / company / money entities with their surrounding text.
+
+    One output line is produced per matching entity, holding its type, label, the
+    score stored at ``values[label]``, up to 10 characters of sentence text before
+    the entity, the entity text, and up to 10 characters after it.
+    :param list_sentences: list whose first item is the collection of sentence objects
+        (read for ``sentence_index`` and ``sentence_text``)
+    :param list_entitys: list of entity lists; every entity is looked up in the
+        sentences of ``list_sentences[0]``
+    :return: the listing lines joined with newlines ('' when nothing matches)
+    '''
+    # Sentences are addressed by position after sorting on sentence_index.
+    ordered = sorted(list_sentences[0], key=lambda sent: sent.sentence_index)
+    wanted_types = ['org', 'company', 'money']
+    lines = []
+    for entity_group in list_entitys:
+        for ent in entity_group:
+            if ent.entity_type not in wanted_types:
+                continue
+            text = ordered[ent.sentence_index].sentence_text
+            start = ent.wordOffset_begin
+            stop = ent.wordOffset_end
+            left_context = text[max(0, start - 10):start]
+            right_context = text[stop:stop + 10]
+            lines.append("%s %d %.4f; %s  %s  %s" % (
+                ent.entity_type, ent.label, ent.values[ent.label],
+                left_context, ent.entity_text, right_context))
+    return '\n'.join(lines)
 
 if __name__=="__main__":
     import pandas as pd

+ 104 - 5
BiddingKG/dl/interface/getAttributes.py

@@ -364,10 +364,10 @@ def get_dict_entity_prob(list_entity,on_value=0.5):
                         if entity.entity_text in identified_role:
                             continue
                     if _key_prob in dict_pack_entity_prob:
-                        new_prob = role_prob+dict_pack_entity_prob[_key_prob][1]
-                        dict_pack_entity_prob[_key_prob] = [entity.entity_text, new_prob] #公司同角色多次出现概率累计
-                        # if role_prob>dict_pack_entity_prob[_key_prob][1]:
-                        #     dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
+                        # new_prob = role_prob+dict_pack_entity_prob[_key_prob][1] if role_prob>0.9 else max(role_prob, dict_pack_entity_prob[_key_prob][1])
+                        # dict_pack_entity_prob[_key_prob] = [entity.entity_text, new_prob] #公司同角色多次出现概率累计
+                        if role_prob>dict_pack_entity_prob[_key_prob][1]:
+                            dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
                     else:
                         dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
     return dict_pack_entity_prob
@@ -575,7 +575,7 @@ def getPackagesFromArticle(list_sentence,list_entity):
     
     package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
     package_N_name_pattern = re.compile("(([^承]|^)分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
-    package_number_pattern = re.compile("(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")  # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
+    package_number_pattern = re.compile("(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|([^\.]?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(标[段号的包]))")  # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
     # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)')  # 新正则识别标段
     other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]')  #  # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
     win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]')  # 2020/11/23 大网站规则 调整
@@ -3022,6 +3022,105 @@ def correct_rolemoney(prem, total_product_money):
                 except Exception as e:
                     print('表格产品价格修正中标价格报错:%s'%e)
 
+def limit_maximum_amount(prem, industry):
+    '''
+    Divide over-limit amounts by 10000 for industries that have a ceiling.
+
+    When ``industry['class_name']`` has an entry in the ceiling table, every package
+    in ``prem[0]['prem']`` is checked: a ``win_tenderer`` role's money and the
+    package's ``tendereeMoney`` are divided by 10000 if they exceed the ceiling and
+    either the industry is '餐饮业' / '物业管理' or the recorded unit is '万元'.
+    (Presumably such values were multiplied by 10000 by mistake upstream -- this
+    rationale is not visible here.)
+
+    Errors are handled per package, so a malformed package is reported and skipped
+    without preventing the remaining packages from being corrected.
+    :param prem: list whose first item holds a ``'prem'`` dict of packages; modified
+        in place (role money is written back as ``str``, ``tendereeMoney`` as ``float``)
+    :param industry: dict, only ``class_name`` is read
+    :return: None
+    '''
+    indu = industry.get('class_name', '')
+    indu_amount = {
+        '计算机设备': 200000000,
+        '办公设备': 100000000,
+        '家具用具': 500000000,
+        '办公消耗用品及类似物品': 100000000,
+        '日杂用品': 100000000,
+        '餐饮业': 1000000000,
+        '物业管理': 1000000000,
+        '工程技术与设计服务': 1000000000,
+        '工程评价服务': 100000000,
+        '其他工程服务': 100000000,
+        '工程监理服务': 1000000000,
+        '工程造价服务': 100000000
+    }
+    if indu not in indu_amount:
+        return
+    maximum_amount = indu_amount[indu]
+    # For these two industries an over-limit amount is rescaled whatever its unit.
+    ignore_unit = indu in ['餐饮业', '物业管理']
+    try:
+        packages = list(prem[0]['prem'].values())
+    except Exception as e:
+        print('行业分类限制最高金额抛出异常:%s' % e)
+        return
+    for value in packages:
+        # Best-effort per package: report the problem and carry on with the next one.
+        try:
+            for l in value['roleList']:
+                if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money']) > maximum_amount:
+                    # Short-circuit keeps the unit from being read when it is not needed.
+                    if ignore_unit or l["role_money"]['money_unit'] == '万元':
+                        l["role_money"]['money'] = str(float(l["role_money"]['money']) / 10000)
+            if float(value['tendereeMoney']) > maximum_amount:
+                if ignore_unit or value['tendereeMoneyUnit'] == '万元':
+                    value['tendereeMoney'] = float(value['tendereeMoney']) / 10000
+        except Exception as e:
+            print('行业分类限制最高金额抛出异常:%s' % e)
+
+def get_win_joint(prem, list_entitys, list_sentences, list_articles):
+    '''
+    Detect consortium (联合体) partners of each winning bidder and record them in prem.
+
+    Runs only when ``prem`` mentions 'win_tenderer' and the article content matches
+    one of the consortium markers. For every ``win_tenderer`` role, each occurrence
+    of the winner among the entities (org/company with label 2 -- presumably the
+    win-tenderer label) is located, then the entities that follow it are walked.
+    A following entity is accepted as a partner when it is an org/company in the
+    same sentence and either
+      * it starts fewer than 10 characters after the previous accepted entity and a
+        marker is found in the text between them, or
+      * a marker is found in the 10 characters after it.
+    The walk stops at the first entity that is not accepted. When at least one
+    partner is found, ``role['win_tenderer_joint']`` is set to the winner and the
+    partners joined by ','.
+    :param prem: list whose first item is a dict; its dict values map package names
+        to dicts holding a ``roleList``. Modified in place.
+    :param list_entitys: list of entity lists
+    :param list_sentences: list whose first item is the collection of sentence objects
+    :param list_articles: list whose first item exposes ``content``
+    :return: None; any exception is printed and swallowed (best effort)
+    '''
+    # Marker expected between the winner and a partner.
+    before_pattern = '联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:'
+    # Marker expected right after a partner.
+    # NOTE(review): as written the last alternative matches a bare '成'; literal
+    # full-width brackets may have been intended -- confirm against the repository.
+    after_pattern = '(联合体)|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)'
+    try:
+        if 'win_tenderer' not in str(prem):
+            return
+        if not re.search(before_pattern + '|' + after_pattern, list_articles[0].content):
+            return
+        sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
+        for project in prem[0].values():
+            if not isinstance(project, dict):
+                continue
+            for v in project.values():
+                for d in v['roleList']:
+                    if d.get('role_name', '') != 'win_tenderer':
+                        continue
+                    winner = d.get('role_text')
+                    join_l = [winner]
+                    for list_entity in list_entitys:
+                        # The last entity has nothing after it, so it is never a start point.
+                        for i, _entity in enumerate(list_entity[:-1]):
+                            if not (_entity.entity_type in ['org', 'company'] and _entity.label == 2
+                                    and _entity.entity_text == winner):
+                                continue
+                            s = sentences[_entity.sentence_index].sentence_text
+                            e = _entity.wordOffset_end
+                            for behind_entity in list_entity[i + 1:]:
+                                b2 = behind_entity.wordOffset_begin
+                                e2 = behind_entity.wordOffset_end
+                                # Offsets are only meaningful inside the winner's sentence,
+                                # and only organisations can be consortium members.
+                                is_candidate = (_entity.sentence_index == behind_entity.sentence_index
+                                                and behind_entity.entity_type in ['org', 'company'])
+                                if not is_candidate:
+                                    break
+                                marker_between = b2 - e < 10 and re.search(before_pattern, s[e:b2])
+                                marker_after = re.search(after_pattern, s[e2:e2 + 10])
+                                if not (marker_between or marker_after):
+                                    break
+                                # The winner may occur several times; list each partner once.
+                                if behind_entity.entity_text not in join_l:
+                                    join_l.append(behind_entity.entity_text)
+                                # The next gap is measured from the partner just accepted.
+                                e = e2
+                            if len(join_l) > 1:
+                                d['win_tenderer_joint'] = ','.join(join_l)
+    except Exception as err:
+        print('获取联合体抛出异常', err)
+
+
 if __name__=="__main__":
     '''
     conn = getConnection()

+ 44 - 37
BiddingKG/dl/interface/predictor.py

@@ -664,6 +664,9 @@ class PREMPredict():
                 elif re.search('尊敬的供应商:.{,25}我公司', text):
                     label = 0
                     values[label] = 0.801
+                elif re.search('尊敬的供应商:', text):
+                    label = 0
+                    values[label] = 0.501
             elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
                 label = 0
                 values[label] = 0.501
@@ -1120,22 +1123,22 @@ class RoleRulePredictor():
         self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)" \
                                      "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
                                      "(是|为|:|:|\s*)+$)"
-        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至]+采购意向))"
-        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
+        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向))"
+        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
-        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
+        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
         self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|承租|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
                                         "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
-                                        "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
-        self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
+                                        "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|承保|承包|承接|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
+        self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标(投标)?|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
         self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
         # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
         # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
-                                        "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^成为[\w、()()]+项目的成交供应商|^[((]中标人名称[))]))"
-        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|(谈判结果:|确定)由.{5,20}(向我单位)?供货)|中标通知书.{,15}你方"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
+                                        "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^[作]?([\w、()()]+|本|此|该)项目的?(成交|中选|中标|服务)(供应商|单位|人)|^[((]中标人名称[))]))"
+        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|决定由.{5,20}承办|(谈判结果:|确定)由.{5,20}(向我单位)?供货)|中标通知书.{,15}你方"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
 
         # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
 
@@ -1213,7 +1216,7 @@ class RoleRulePredictor():
                                                    word_flag=True, use_text=True,
                                                    text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
                                 for _name in list_name:
-                                    if _name != "" and str(_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:
+                                    if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:  #加上前面一些信息,修复公司不在项目名称开头的,检测不到
                                         find_flag = True
                                         if p_entity.values[0] > on_value:
                                             p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
@@ -1307,7 +1310,7 @@ class RoleRulePredictor():
                                                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
                                                                                                         list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
@@ -1414,6 +1417,14 @@ class RoleRulePredictor():
 '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
 class RoleRuleFinalAdd():
     def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
+        '''
+        最终规则召回角色
+        :param list_articles:
+        :param list_sentences:
+        :param list_entitys:
+        :param list_codenames:
+        :return:
+        '''
         # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
         main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
         end_tokens = []
@@ -1422,11 +1433,12 @@ class RoleRuleFinalAdd():
         text_end = "".join(end_tokens[-30:])
         # print(text_end)
         # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
-        sear_ent = re.search('[,。;]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
-        sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
-        sear_ent3 = re.search('(报名咨询|[收送交]货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
-        sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主)[,::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
-        sear_list = [sear_ent4 , sear_ent3 , sear_ent2 , sear_ent]
+        sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
+        sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
+        sear_ent2 = re.search('(户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
+        sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点|)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
+        sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|尊敬的供应商|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
+        sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]
 
         tenderee_notfound = True
         agency_notfound = True
@@ -1434,32 +1446,21 @@ class RoleRuleFinalAdd():
         ents = []
         for ent in list_entitys[0]:
             if ent.entity_type in ['org', 'company']:
-                if ent.label == 0:
+                if ent.label == 0 and ent.values[ent.label]>=0.5:
+                    if '公共资源交易中心' in ent.entity_text:
+                        ent.label = 5
+                        continue
                     tenderee_list.append(ent.entity_text)
                     tenderee_notfound = False
                 elif ent.label == 1:
                     agency_notfound = False
                 elif ent.label == 5:
+                    if '公共资源交易中心' in ent.entity_text:
+                        continue
                     ents.append(ent)
-        if sear_ent or sear_ent2 or sear_ent3 or sear_ent4:
+        if sear_ent or sear_ent1 or sear_ent2 or sear_ent3 or sear_ent4:
             for _sear_ent in [_sear for _sear in sear_list if _sear]:
-                # if sear_ent4:
-                #     ent_re = sear_ent4.group(2)
-                # elif sear_ent3:
-                #     ent_re = sear_ent3.group(2)
-                # elif sear_ent2:
-                #     ent_re = sear_ent2.group(2)
-                # else:
-                #     ent_re = sear_ent.group(1)
-                if _sear_ent==sear_ent4:
-                    ent_re = _sear_ent.group(2)
-                elif _sear_ent==sear_ent3:
-                    ent_re = _sear_ent.group(2)
-                elif _sear_ent==sear_ent2:
-                    ent_re = _sear_ent.group(2)
-                else:
-                    ent_re = _sear_ent.group(1)
-                # print('ent_re', ent_re)
+                ent_re = _sear_ent.group('entity')
                 ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
 
                 if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
@@ -1489,8 +1490,8 @@ class RoleRuleFinalAdd():
                             agency_notfound = False
                             # log('正则最后补充实体: %s'%(ent_re))
                             break
-                if not tenderee_notfound:
-                    break
+                    if not tenderee_notfound:
+                        break
 
         elif list_codenames[0]['name'] != "":  #把标题包含的公司实体作为招标人
             # tenderee_notfound = True
@@ -1509,6 +1510,7 @@ class RoleRuleFinalAdd():
                     if ent.entity_text in list_codenames[0]['name']:
                         ent.label = 0
                         ent.values[0] = 0.5
+                        tenderee_notfound == False
                         # log('正则召回标题中包含的实体:%s'%ent.entity_text)
                         break
 
@@ -3045,6 +3047,7 @@ class DocChannel():
           6、预测及原始均在变更、答疑,返回原始类别
           7、预测为采招数据,原始为产权且有关键词,返回原始类别
           8、废标公告原始为招标、预告且标题无废标关键期,返回原始类别
+          9、若预测为非采招数据且源网为采招数据且标题无关键词返回采招数据
           '''
           if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
                   original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(prem_json)==False:
@@ -3080,6 +3083,10 @@ class DocChannel():
                   self.title_life_dic['废标公告'], title) == None:
               result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
               msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
+          elif result['docchannel']['doctype'] != '采招数据' and origin_dic.get(
+                  original_docchannel, '') not in ['产权交易', '土地矿产', '拍卖出让'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产|挂牌|出让|拍卖|招拍|划拨', title)==None:
+              result['docchannel']['doctype'] = '采招数据'
+              msc += '最终规则修改:预测为非采招数据,原始为采招数据且无关键词,返回采招数据'
 
           '''下面是新格式增加返回字段'''
           if result['docchannel']['docchannel'] != '':  # 预测到生命周期的复制到life_docchannel,否则用数据源结果
@@ -3151,7 +3158,7 @@ class DocChannel():
               type_id, type_prob = type_model_predict()
               type_model = self.id2type[type_id]
               result['docchannel']['doctype'] = type_model
-              msc += type_model + ';'
+              msc += type_model + ' 概率:%.4f;'%type_prob
               # print('公告类别:', self.id2type[id], '概率:',prob)
               # if id == 0:
           if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
@@ -3159,7 +3166,7 @@ class DocChannel():
                   life_id, life_prob = life_model_predict()
                   life_model = self.id2life[life_id]
                   result['docchannel']['docchannel'] = life_model
-                  msc += life_model + ';\n'
+                  msc += life_model + ' 概率:%.4f;\n'%life_prob
 
       msc = final_change(msc)
       # print('channel ', msc)