Bladeren bron

调整channel 规则,优化提取

lsm 2 jaren geleden
bovenliggende
commit
97450aea34
1 gewijzigde bestanden met toevoegingen van 15 en 12 verwijderingen
  1. 15 12
      BiddingKG/dl/interface/predictor.py

+ 15 - 12
BiddingKG/dl/interface/predictor.py

@@ -2521,7 +2521,7 @@ class DocChannel():
           '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
           '招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
           '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|\w{,5}材料)|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
-          '资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过(原因|资格)',
+          '资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格
           '招标答疑': '现澄清(为|如下)|答疑补遗|澄清内容如下|第[0-9一二三四五]次澄清|答疑澄清|(最高(投标)?限价|控制价|拦标价)公示',  # |异议的回复
           '公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
           '候选人公示': '候选人公示|评标结果公示',
@@ -2891,12 +2891,9 @@ class DocChannel():
                       life_kw_content[k2]['pos'].append(it.group(0))
                   else:
                       life_kw_content[k2]['neg'].append(it.group(0))
-              if k2 not in life_score:
-                  life_score[k2] = count_score(life_kw_content[k2]['pos']) if 'neg' not in k else -count_score(
-                      life_kw_content[k2]['neg'])
-              else:
-                  life_score[k2] = life_score[k2] + count_score(life_kw_content[k2]['pos']) if 'neg' not in k else \
-                  life_score[k2] - count_score(life_kw_content[k2]['neg'])
+          for k2 in life_kw_content:
+              life_score[k2] = count_score(life_kw_content[k2]['pos']) - count_score(
+                  life_kw_content[k2]['neg'])
 
           life_kw_title = {k: v for k, v in life_kw_title.items() if v != []}
           life_kw_content = {k: v for k, v in life_kw_content.items() if life_score[k] > 0}
@@ -2950,13 +2947,13 @@ class DocChannel():
                   return '招标公告', msc
               elif '废标公告' in life_kw_title:
                   return '废标公告', msc
-              elif life_score.get('候选人公示', 0) > 3:
+              elif life_score.get('候选人公示', 0) >= 3:
                   return '候选人公示', msc
               elif life_score.get('合同公告', 0) > 5:
                   return '合同公告', msc
               return '中标信息', msc
           elif '废标公告' in life_kw_title or '废标公告' in life_list:
-              if life_score.get('招标公告', 0) > 3:
+              if life_score.get('招标公告', 0) > 3 and '废标公告' not in life_kw_title:
                   return '招标公告', msc
               return '废标公告', msc
           elif '资审结果' in life_kw_title or '资审结果' in life_list:
@@ -3016,9 +3013,10 @@ class DocChannel():
           5、预测及原始均在招标、预告、意向,返回原始类别
           6、预测及原始均在变更、答疑,返回原始类别
           7、预测为采招数据,原始为产权且有关键词,返回原始类别
+          8、废标公告原始为招标、预告且标题无废标关键词,返回原始类别
           '''
           if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
-                  original_docchannel, '') in ['招标公告', '采购意向', '招标预告']:
+                  original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(prem_json)==False:
               result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
               msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
           elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
@@ -3028,9 +3026,9 @@ class DocChannel():
           elif result['docchannel']['docchannel'] in ['招标答疑'] and re.search(
                   self.title_life_dic['招标答疑'], title) == None and origin_dic.get(
                   original_docchannel, '') in ['招标公告', '采购意向', '招标预告']:
-              result['docchannel']['docchannel'] = '中标信息'
+              result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
               msc += '最终规则修改:答疑公告标题无答疑关键且原始为招标,返回原始类别;'
-          elif result['docchannel']['docchannel'] == '中标信息' and is_contain_winner(prem_json) and origin_dic.get(
+          elif result['docchannel']['docchannel'] == '招标公告' and is_contain_winner(prem_json) and origin_dic.get(
                   original_docchannel, '') == '中标信息':
               result['docchannel']['docchannel'] = '中标信息'
               msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
@@ -3046,6 +3044,11 @@ class DocChannel():
                   original_docchannel, '') in ['产权交易', '土地矿产'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产', text):
               result['docchannel']['doctype'] = origin_dic.get(original_docchannel, '')
               msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别'
+          elif result['docchannel']['docchannel'] == '废标公告' and origin_dic.get(
+                  original_docchannel, '') in ['招标公告', '采购意向', '招标预告'] and re.search(
+                  self.title_life_dic['废标公告'], title) == None:
+              result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
+              msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
 
           '''下面是新格式增加返回字段'''
           if result['docchannel']['docchannel'] != '':  # 预测到生命周期的复制到life_docchannel,否则用数据源结果