Ver Fonte

新增正则提取channel,正则模型联合预测channel类型

lsm há 3 anos atrás
pai
commit
d245a537a4

+ 123 - 118
BiddingKG/dl/interface/Preprocessing.py

@@ -1657,125 +1657,129 @@ def get_preprocessed(articles, useselffool=False):
     return list_articles,list_sentences,list_entitys,list_outlines,cost_time
 
 def special_treatment(sourceContent, web_source_no):
-    if web_source_no == 'DX000202-1':
-         ser = re.search('中标供应商及中标金额:【((\w{5,20}-[\d,.]+,)+)】', sourceContent)
-         if ser:
-             new = ""
-             l = ser.group(1).split(',')
-             for i in range(len(l)):
-                 it = l[i]
-                 if '-' in it:
-                     role, money = it.split('-')
-                     new += '标段%d, 中标供应商: ' % (i + 1) + role + ',中标金额:' + money + '。'
-             sourceContent = sourceContent.replace(ser.group(0), new, 1)
-    elif web_source_no == '00753-14':
-        body = sourceContent.find("body")
-        body_child = body.find_all(recursive=False)
-        pcontent = body
-        if 'id' in body_child[0].attrs:
-            if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
-                pcontent = body_child[0]
-        # pcontent = sourceContent.find("div", id="pcontent")
-        pcontent = pcontent.find_all(recursive=False)[0]
-        first_table = None
-        for idx in range(len(pcontent.find_all(recursive=False))):
-            t_part = pcontent.find_all(recursive=False)[idx]
-            if t_part.name != "table":
-                break
-            if idx == 0:
-                first_table = t_part
-            else:
-                for _tr in t_part.find("tbody").find_all(recursive=False):
-                    first_table.find("tbody").append(_tr)
-                t_part.clear()
-    elif web_source_no == 'DX008357-11':
-        body = sourceContent.find("body")
-        body_child = body.find_all(recursive=False)
-        pcontent = body
-        if 'id' in body_child[0].attrs:
-            if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
-                pcontent = body_child[0]
-        # pcontent = sourceContent.find("div", id="pcontent")
-        pcontent = pcontent.find_all(recursive=False)[0]
-        error_table = []
-        is_error_table = False
-        for part in pcontent.find_all(recursive=False):
-            if is_error_table:
-                if part.name == "table":
-                    error_table.append(part)
-                else:
+    try:
+        if web_source_no == 'DX000202-1':
+             ser = re.search('中标供应商及中标金额:【((\w{5,20}-[\d,.]+,)+)】', sourceContent)
+             if ser:
+                 new = ""
+                 l = ser.group(1).split(',')
+                 for i in range(len(l)):
+                     it = l[i]
+                     if '-' in it:
+                         role, money = it.split('-')
+                         new += '标段%d, 中标供应商: ' % (i + 1) + role + ',中标金额:' + money + '。'
+                 sourceContent = sourceContent.replace(ser.group(0), new, 1)
+        elif web_source_no == '00753-14':
+            body = sourceContent.find("body")
+            body_child = body.find_all(recursive=False)
+            pcontent = body
+            if 'id' in body_child[0].attrs:
+                if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
+                    pcontent = body_child[0]
+            # pcontent = sourceContent.find("div", id="pcontent")
+            pcontent = pcontent.find_all(recursive=False)[0]
+            first_table = None
+            for idx in range(len(pcontent.find_all(recursive=False))):
+                t_part = pcontent.find_all(recursive=False)[idx]
+                if t_part.name != "table":
                     break
-            if part.name == "div" and part.get_text(strip=True) == "中标候选单位:":
-                is_error_table = True
-        first_table = None
-        for idx in range(len(error_table)):
-            t_part = error_table[idx]
-            # if t_part.name != "table":
-            #     break
-            if idx == 0:
-                for _tr in t_part.find("tbody").find_all(recursive=False):
-                    if _tr.get_text(strip=True) == "":
-                        _tr.decompose()
-                first_table = t_part
-            else:
-                for _tr in t_part.find("tbody").find_all(recursive=False):
-                    if _tr.get_text(strip=True) != "":
+                if idx == 0:
+                    first_table = t_part
+                else:
+                    for _tr in t_part.find("tbody").find_all(recursive=False):
                         first_table.find("tbody").append(_tr)
-                t_part.clear()
-    elif web_source_no == '18021-2':
-        body = sourceContent.find("body")
-        body_child = body.find_all(recursive=False)
-        pcontent = body
-        if 'id' in body_child[0].attrs:
-            if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
-                pcontent = body_child[0]
-        # pcontent = sourceContent.find("div", id="pcontent")
-        td = pcontent.find_all("td")
-        for _td in td:
-            if str(_td.string).strip() == "报价金额":
-                _td.string = "单价"
-    elif web_source_no == '13740-2':
-        # “xxx成为成交供应商”
-        re_match = re.search("[^,。]+成为[^,。]*成交供应商", sourceContent)
-        if re_match:
-            sourceContent = sourceContent.replace(re_match.group(), "成交人:" + re_match.group(), sourceContent)
-    elif web_source_no == '03786-10':
-        ser1 = re.search('中标价:([\d,.]+)', sourceContent)
-        ser2 = re.search('合同金额[((]万元[))]:([\d,.]+)', sourceContent)
-        if ser1 and ser2:
-            m1 = ser1.group(1).replace(',', '')
-            m2 = ser2.group(1).replace(',', '')
-            if float(m1) < 100000 and (m1.split('.')[0] == m2.split('.')[0] or m2 == '0'):
-                new = '中标价(万元):' + m1
-                sourceContent = sourceContent.replace(ser1.group(0), new, 1)
-    elif web_source_no=='00076-4':
-        ser = re.search('主要标的数量:([0-9一]+)\w{,3},主要标的单价:([\d,.]+)元?,合同金额:(.00),', sourceContent)
-        if ser:
-            num = ser.group(1).replace('一', '1')
-            try:
-                num = 1 if num == '0' else num
-                unit_price = ser.group(2).replace(',', '')
-                total_price = str(int(num) * float(unit_price))
-                new = '合同金额:' + total_price
-                sourceContent = sourceContent.replace('合同金额:.00', new, 1)
-            except Exception as e:
-                log('preprocessing.py special_treatment exception')
-    elif web_source_no=='DX000105-2':
-        if re.search("成交公示", sourceContent) and re.search(',投标人:', sourceContent) and re.search(',成交人:', sourceContent)==None:
-            sourceContent = sourceContent.replace(',投标人:', ',成交人:')
-    elif web_source_no in ['04080-3', '04080-4']:
-        ser = re.search('合同金额:([0-9,]+.[0-9]{3,})(.{,4})', sourceContent)
-        if ser and '万' not in ser.group(2):
-            sourceContent = sourceContent.replace('合同金额:', '合同金额(万元):')
-    elif web_source_no=='03761-3':
-        ser = re.search('中标价,([0-9]+)[.0-9]*%', sourceContent)
-        if ser and int(ser.group(1))>100:
-            sourceContent = sourceContent.replace(ser.group(0), ser.group(0)[:-1]+'元')
-    elif web_source_no=='00695-7':
-        ser = re.search('支付金额:', sourceContent)
-        if ser:
-            sourceContent = sourceContent.replace('支付金额:', '合同金额:')
-    return sourceContent
+                    t_part.clear()
+        elif web_source_no == 'DX008357-11':
+            body = sourceContent.find("body")
+            body_child = body.find_all(recursive=False)
+            pcontent = body
+            if 'id' in body_child[0].attrs:
+                if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
+                    pcontent = body_child[0]
+            # pcontent = sourceContent.find("div", id="pcontent")
+            pcontent = pcontent.find_all(recursive=False)[0]
+            error_table = []
+            is_error_table = False
+            for part in pcontent.find_all(recursive=False):
+                if is_error_table:
+                    if part.name == "table":
+                        error_table.append(part)
+                    else:
+                        break
+                if part.name == "div" and part.get_text(strip=True) == "中标候选单位:":
+                    is_error_table = True
+            first_table = None
+            for idx in range(len(error_table)):
+                t_part = error_table[idx]
+                # if t_part.name != "table":
+                #     break
+                if idx == 0:
+                    for _tr in t_part.find("tbody").find_all(recursive=False):
+                        if _tr.get_text(strip=True) == "":
+                            _tr.decompose()
+                    first_table = t_part
+                else:
+                    for _tr in t_part.find("tbody").find_all(recursive=False):
+                        if _tr.get_text(strip=True) != "":
+                            first_table.find("tbody").append(_tr)
+                    t_part.clear()
+        elif web_source_no == '18021-2':
+            body = sourceContent.find("body")
+            body_child = body.find_all(recursive=False)
+            pcontent = body
+            if 'id' in body_child[0].attrs:
+                if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
+                    pcontent = body_child[0]
+            # pcontent = sourceContent.find("div", id="pcontent")
+            td = pcontent.find_all("td")
+            for _td in td:
+                if str(_td.string).strip() == "报价金额":
+                    _td.string = "单价"
+        elif web_source_no == '13740-2':
+            # “xxx成为成交供应商”
+            re_match = re.search("[^,。]+成为[^,。]*成交供应商", sourceContent)
+            if re_match:
+                sourceContent = sourceContent.replace(re_match.group(), "成交人:" + re_match.group())
+        elif web_source_no == '03786-10':
+            ser1 = re.search('中标价:([\d,.]+)', sourceContent)
+            ser2 = re.search('合同金额[((]万元[))]:([\d,.]+)', sourceContent)
+            if ser1 and ser2:
+                m1 = ser1.group(1).replace(',', '')
+                m2 = ser2.group(1).replace(',', '')
+                if float(m1) < 100000 and (m1.split('.')[0] == m2.split('.')[0] or m2 == '0'):
+                    new = '中标价(万元):' + m1
+                    sourceContent = sourceContent.replace(ser1.group(0), new, 1)
+        elif web_source_no=='00076-4':
+            ser = re.search('主要标的数量:([0-9一]+)\w{,3},主要标的单价:([\d,.]+)元?,合同金额:(.00),', sourceContent)
+            if ser:
+                num = ser.group(1).replace('一', '1')
+                try:
+                    num = 1 if num == '0' else num
+                    unit_price = ser.group(2).replace(',', '')
+                    total_price = str(int(num) * float(unit_price))
+                    new = '合同金额:' + total_price
+                    sourceContent = sourceContent.replace('合同金额:.00', new, 1)
+                except Exception as e:
+                    log('preprocessing.py special_treatment exception')
+        elif web_source_no=='DX000105-2':
+            if re.search("成交公示", sourceContent) and re.search(',投标人:', sourceContent) and re.search(',成交人:', sourceContent)==None:
+                sourceContent = sourceContent.replace(',投标人:', ',成交人:')
+        elif web_source_no in ['04080-3', '04080-4']:
+            ser = re.search('合同金额:([0-9,]+.[0-9]{3,})(.{,4})', sourceContent)
+            if ser and '万' not in ser.group(2):
+                sourceContent = sourceContent.replace('合同金额:', '合同金额(万元):')
+        elif web_source_no=='03761-3':
+            ser = re.search('中标价,([0-9]+)[.0-9]*%', sourceContent)
+            if ser and int(ser.group(1))>100:
+                sourceContent = sourceContent.replace(ser.group(0), ser.group(0)[:-1]+'元')
+        elif web_source_no=='00695-7':
+            ser = re.search('支付金额:', sourceContent)
+            if ser:
+                sourceContent = sourceContent.replace('支付金额:', '合同金额:')
+        return sourceContent
+    except Exception as e:
+        log('特殊数据源: %s 预处理特别修改抛出异常: %s'%(web_source_no, e))
+        return sourceContent
 
 def article_limit(soup,limit_words=30000):
     sub_space = re.compile("\s+")
@@ -2348,7 +2352,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             if k.split("_")[0]=="money":
                                 entity_text = v
                             if k.split("_")[0]=="unit":
-                                unit = v
+                                if v=='万元' or unit=="":  # 处理  预算金额(元):160万元 这种出现前后单位不一致情况
+                                    unit = v
                             if k.split("_")[0]=="text":
                                 text_beforeMoney = v
                             if k.split("_")[0]=="filter":

+ 34 - 24
BiddingKG/dl/interface/extract.py

@@ -116,11 +116,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     cost_time["preprocess"] = round(time.time()-start_time,2)
     cost_time.update(_cost_time)
 
-    #依赖句子顺序
-    start_time = time.time() # 公告类型/生命周期提取
-    channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
-                                                            web_source_no=web_source_no,original_docchannel=original_docchannel)
-    cost_time["channel"] = round(time.time()-start_time,2)
+    # #依赖句子顺序
+    # start_time = time.time() # 公告类型/生命周期提取  此处作废 换到后面预测 2022/4/29
+    # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
+    #                                                         web_source_no=web_source_no,original_docchannel=original_docchannel)
+    # cost_time["channel"] = round(time.time()-start_time,2)
 
     start_time = time.time() # 项目编号、名称提取
     codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
@@ -132,12 +132,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     log("get prem done of doc_id%s"%(doc_id))
     cost_time["prem"] = round(time.time()-start_time,2)
 
-    start_time = time.time() # 产品名称及废标原因提取
-    fail = channel_dic['docchannel']['docchannel'] == "废标公告"
-    fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类
-    # predictor.getPredictor("product").predict(list_sentences, list_entitys)
-    log("get product done of doc_id%s"%(doc_id))
-    cost_time["product"] = round(time.time()-start_time,2)
+    # start_time = time.time() # 产品名称及废标原因提取  此处作废 换到后面预测 2022/4/29
+    # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
+    # fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类
+    # # predictor.getPredictor("product").predict(list_sentences, list_entitys)
+    # log("get product done of doc_id%s"%(doc_id))
+    # cost_time["product"] = round(time.time()-start_time,2)
 
     start_time = time.time() # 产品相关要素正则提取 单价、数量、品牌规格 ; 项目、需求、预算、时间
     product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
@@ -178,7 +178,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
             for _entity in list_entity:
                 # print('keyword:',keyword, '_entity.notes :',_entity.notes)
                 if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label==2:
-                    if channel_dic['docchannel'] == "招标公告":
+                    # if channel_dic['docchannel'] == "招标公告":
+                    if re.search('中标|成交|中选|中价|中租||结果|入围', title+list_articles[0].content[:100])==None:
                         _entity.values[0] = 0.51
                         _entity.set_Money(0, _entity.values)  #2021/11/18 根据公告类别把费用改为招标或中投标金额
                     else:
@@ -203,15 +204,6 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
     cost_time["punish"] = round(time.time()-start_time,2)
 
-    '''公告无表格格式时,采购意向预测'''
-    if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
-        product_attrs = predictor.getPredictor("product_attrs").predict_without_table(product_attrs, list_sentences,
-                                                                                      list_entitys,codeName,prem,text,page_time)
-    if len(product_attrs[1]['demand_info']['data'])>0:
-        for d in product_attrs[1]['demand_info']['data']:
-            for product in set(prem[0]['product']):
-                if product in d['project_name'] and product not in d['product']:
-                    d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
 
     '''修正采购公告表格形式多种采购产品中标价格'''
     if total_product_money>0 and len(prem[0]['prem'])==1:
@@ -227,12 +219,30 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
                 except Exception as e:
                     log('表格产品价格修正中标价格报错:%s'%e)
 
-    '''修正channel预测类别为招标公告却有中标人及预测为中标信息却无中标关键词的类别'''
+    '''修正channel预测类别为招标公告却有中标人及预测为中标信息却无中标关键词的类别''' # 依赖 prem
     start_time = time.time()
-    content = list_articles[0].content
-    channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
+    # content = list_articles[0].content
+    # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
+    channel_dic = predictor.getPredictor("channel").predict_merge(title,list_sentences[0], text, list_articles[0].bidway, prem[0], original_docchannel)
     cost_time["rule_channel"] = round(time.time()-start_time,2)
 
+    start_time = time.time() # 产品名称及废标原因提取  #依赖 docchannel结果
+    fail = channel_dic['docchannel']['docchannel'] == "废标公告"
+    fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类
+    # predictor.getPredictor("product").predict(list_sentences, list_entitys)
+    log("get product done of doc_id%s"%(doc_id))
+    cost_time["product"] = round(time.time()-start_time,2)
+
+    '''公告无表格格式时,采购意向预测'''  #依赖 docchannel结果
+    if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
+        product_attrs = predictor.getPredictor("product_attrs").predict_without_table(product_attrs, list_sentences,
+                                                                                      list_entitys,codeName,prem,text,page_time)
+    if len(product_attrs[1]['demand_info']['data'])>0:
+        for d in product_attrs[1]['demand_info']['data']:
+            for product in set(prem[0]['product']):
+                if product in d['project_name'] and product not in d['product']:
+                    d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
+
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason)

+ 326 - 4
BiddingKG/dl/interface/predictor.py

@@ -1281,7 +1281,7 @@ class RoleRulePredictor():
                                                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|交易服务单位',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
                                                                                                         list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
@@ -2439,13 +2439,60 @@ class DocChannel():
     self.id2type = {k: v for k, v in enumerate(lb_type)}
     self.id2life = {k: v for k, v in enumerate(lb_life)}
 
+    self.load_pattern()
+
+  def load_pattern(self):
+      self.type_dic = {
+            '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
+            '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
+            '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
+            '采招数据': '(采购|招标|代理)(人|机构|单位)|(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;' #|变更|答疑|澄清|中标|成交|合同|废标|流标
+        }
+
+      self.title_type_dic = {
+            '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
+            '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
+            '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
+            '采招数据': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)',  # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
+            '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)'
+        }
+      self.life_dic = {
+            '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
+            '招标预告': '预计(采购|招标)(时间|日期)',
+            '招标公告': '(采购|招标|竞选|报名)条件;报名时间;报名流程;报名方法;报名需提供的材料;参加竞价采购交易资格;(申请人|投标人|供应商|报价人|参选人)的?资格要求;获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件;(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
+            '资审结果': '招标资审公告|评审入围公示|资审及业绩公示|资格后审情况报告|资格后审结果公告|资格后审结果公示|资格预审结果公告|资格预审结果公示|预审公示|预审结果公示',
+            '招标答疑': '现澄清为|答疑澄清公告|异议的回复|(最高(投标)?限价|控制价|拦标价)公示',
+            '公告变更': '原公告(主要)?(信息|内容)|变更[前后]内容|现在?(变更|更正|修改|更改)为|(变更|更正)内容为|更正理由|更正人名称|[、\s](更正信息|更正内容):',
+            '候选人公示': '候选人公示|评标结果公示',
+            '中标信息': '供地结果信息|采用单源直接采购的?情况说明|现将\w{,4}(成交|中标|中选|选定结果|选取结果)\w{2,8}(进行公示|公示如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|(中标(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
+            '中标信息2': '(成交|中标)(日期|时间)[::\s]|成交金额:',
+            '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
+            '合同公告': '合同(公告|公示)信息;合同(公告|公示)日期;合同(公告|公示)内容;合同编号;合同名称;合同签订日期;合同主体;供应商乙方',
+            '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):废标|((本|该)项目|本标段|本次(招标)?)((采购|招标)?(失败|终止|流标|废标)|(按|做|作)(流标|废标)处理)',
+        }
+      self.title_life_dic = {
+            '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
+            '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|供应计划$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
+            '公告变更': '(变更|更正(事项)?|更改|延期|暂停)的?(公告|公示|通知)|变更$|更正$',
+            '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)公示',
+            '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销|取消成交)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)',
+            '合同公告': '(合同(成交)?|履约验收|履约|验收结果)(公告|公示|信息|公式)|合同备案|合同书',  # 合同$|
+            '候选人公示': '候选人公示|评标(结果)?公示|中标前?公示|中标预公示',
+            '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)结果|开标(记录|信息|情况)|中标通知书|中标$',
+            # '资审结果': '(资质|资格)(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)(审查|预审)结果(公示)?|资审结果公示|未?入围(公示|公告)|资审及业绩公示',
+            '资审结果': '((资格|资质)(审查|预审|后审|审核|入围项?目?)|资审|入围)结果(公告|公示)?|(资质|资格)(预审|后审|入围)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|未?入围(公示|公告)|资审及业绩公示',
+            '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)',
+        }
+
+      self.wrong_win = '按项目控制价下浮\d%即为成交价|不得确定为(中标|成交)|招标人按下列原则选择中标人|确定成交供应商:|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)|竞拍起止时间:'
+
   def load_life(self,life_model):
     with tf.Graph().as_default() as graph:
       output_graph_def = graph.as_graph_def()
       with open(os.path.dirname(__file__)+life_model, 'rb') as f:
         output_graph_def.ParseFromString(f.read())
         tf.import_graph_def(output_graph_def, name='')
-        print("%d ops in the final graph" % len(output_graph_def.node))
+        # print("%d ops in the final graph" % len(output_graph_def.node))
         del output_graph_def
         sess = tf.Session(graph=graph)
         sess.run(tf.global_variables_initializer())
@@ -2464,7 +2511,7 @@ class DocChannel():
       with open(os.path.dirname(__file__)+type_model, 'rb') as f:
         output_graph_def.ParseFromString(f.read())
         tf.import_graph_def(output_graph_def, name='')
-        print("%d ops in the final graph" % len(output_graph_def.node))
+        # print("%d ops in the final graph" % len(output_graph_def.node))
         del output_graph_def
         sess = tf.Session(graph=graph)
         sess.run(tf.global_variables_initializer())
@@ -2611,7 +2658,7 @@ class DocChannel():
       id = np.argmax(pred, axis=1)[0]
       prob = pred[0][id]
       result['docchannel']['docchannel'] = self.id2life[id]
-      # print('生命周期:',self.id2life[id], '概率:',prob)
+      # print('生命周期:纯模型预测',self.id2life[id], '概率:',prob)
       # if id == 6:
       if result['docchannel']['docchannel'] == '中标信息':
         if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
@@ -2666,6 +2713,281 @@ class DocChannel():
               log('正则把中标信息修改为空')
       return channel_dic
 
+  def predict_merge(self, title, list_sentence, html, bidway, prem, original_docchannel='', web_source_no=''):
+      '''
+      Predict the announcement type (doctype) and life-cycle stage (docchannel)
+      by combining keyword regexes with the classification models.
+
+      The regex rules run first.  A model is only consulted for a field the
+      regexes left empty: the type model when no doctype keyword was found,
+      the life-cycle model when no docchannel keyword was found.
+
+      :param title: announcement title
+      :param list_sentence: pre-processed sentence objects; only their
+          ``sentence_index`` and ``tokens`` attributes are read here
+      :param html: original HTML of the announcement
+      :param bidway: extracted bidding method, e.g. '单一来源'
+      :param prem: extracted prem dictionary; it is serialised to JSON and
+          searched for the 'win_tenderer' role
+      :param original_docchannel: channel supplied by the data source
+      :param web_source_no: identifier of the data source
+      :return: dict shaped like
+          {'docchannel': {'docchannel': '中标信息', 'doctype': '采招数据'}}
+      '''
+      def cut_single_cn_space(text):
+          # Glue single-character tokens and "<CJK char>:" labels back onto the
+          # preceding text; every other token is re-joined with one space.
+          new_text = ""
+          for w in text.split():
+              if len(w) == 1 or re.search('^[\u4e00-\u9fa5][::]', w):
+                  new_text += w
+              else:
+                  new_text += ' ' + w
+          return new_text
+
+      def html2text(html):
+          # Everything from the attachment container onwards is replaced by a
+          # marker so that get_life can tell an attachment was present.
+          ser = re.search('<div[^<>]*richTextFetch', html)
+          if ser:
+              html = html[:ser.start()] + '##richTextFetch##'
+          text = re.sub('<[^<]*?>', '', html).replace('&nbsp;', ' ')
+          text = re.sub(r'\s+', ' ', text)
+          text = re.sub('[/|[()()]', '', text)
+          text = cut_single_cn_space(text)
+          return text[:20000]
+
+      def count_diffser(pattern, text):
+          # `pattern` is a ';'-separated list of regexes.  Returns how many of
+          # them match and the matched keywords joined by ';'.
+          kw = []
+          for p in pattern.split(';'):
+              found = re.search(p, text)
+              if found:
+                  kw.append(found.group(0))
+          return len(kw), ';'.join(kw)
+
+      def is_contain_winner(extract_json):
+          return re.search('win_tenderer', extract_json) is not None
+
+      def is_single_source(bidway, title):
+          return re.search('单一来源|单一性采购', title) is not None or bidway == '单一来源'
+
+      def get_type(title, text):
+          # Returns (doctype, matched keyword or explanation).
+          if re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text):
+              if re.search(self.title_type_dic['采招数据'], title + text[:50]):
+                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
+              return '土地矿产', (re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text)).group(0)
+          elif re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text):
+              if re.search(self.title_type_dic['采招数据'], title + text[:50]):
+                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
+              return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0)
+          elif re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text):
+              if re.search(self.title_type_dic['采招数据'], title + text[:50]):
+                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
+              return '产权交易', (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)).group(0)
+          elif re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text):
+              return '采招数据', (re.search(self.title_type_dic['采招数据'], title)
+                              or re.search(self.type_dic['采招数据'], title + text)).group(0)
+          elif re.search(self.title_type_dic['新闻资讯'], title):
+              if re.search(self.title_type_dic['采招数据'], title + text[:150]):
+                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:150]).group(0)
+              return '新闻资讯', re.search(self.title_type_dic['新闻资讯'], title).group(0)
+          else:
+              return '', '没有公告类型关键词,返回空'
+
+      def get_life(title, text, extract_json="", bidway="", original_docchannel=''):
+          # Returns (docchannel, matched keyword or explanation).  The branch
+          # order encodes rule priority and must not be rearranged.
+          # NOTE(review): `original_docchannel` is accepted but not used here.
+          if re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100]):
+              if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
+                  return '公告变更', (re.search(self.title_life_dic['公告变更'], title)
+                                  or re.search(self.life_dic['公告变更'], text)).group(0)
+              elif re.search(self.title_life_dic['候选人公示'], title):
+                  return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
+              elif re.search(self.title_life_dic['中标信息'], title):
+                  return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
+              elif re.search('终止|废标|流标', title):
+                  return '废标公告', re.search('终止|废标|流标', title).group(0)
+              elif is_single_source(bidway, title):
+                  return '中标信息', 'bidway单一来源'
+              # Both searches matched (branch guard), so `and` yields the text match.
+              return '采购意向', (re.search(self.title_life_dic['采购意向'], title)
+                              and re.search(self.life_dic['采购意向'], text[:100])).group(0)
+          elif re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text):
+              if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
+                  return '公告变更', (re.search(self.title_life_dic['公告变更'], title)
+                                  or re.search(self.life_dic['公告变更'], text)).group(0)
+              elif re.search(self.title_life_dic['候选人公示'], title):
+                  return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
+              elif re.search(self.title_life_dic['中标信息'], title):
+                  return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
+              elif re.search('终止|废标|流标', title):
+                  return '废标公告', re.search('终止|废标|流标', title).group(0)
+              # Fixed: the bid-way must be passed here, not the prem JSON string,
+              # otherwise the bidway == '单一来源' test can never succeed.
+              elif is_single_source(bidway, title):
+                  return '中标信息', 'bidway单一来源'
+              return '招标预告', (re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text)).group(0)
+          elif re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
+              if re.search(self.title_life_dic['废标公告'], title):
+                  return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0)
+              return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(0)
+          elif re.search(self.title_life_dic['招标答疑'], title) or re.search(self.life_dic['招标答疑'], text) or len(
+                  re.findall('(答:|回复:)', text)) >= 2:
+              if re.search(self.title_life_dic['废标公告'], title):
+                  return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0)
+              elif re.search('(中标|成交)结果', title[-8:]):
+                  return '中标信息', re.search('(中标|成交)结果', title[-8:]).group(0)
+              return '招标答疑', (re.search(self.title_life_dic['招标答疑'], title)
+                              or re.search(self.life_dic['招标答疑'], text)
+                              or re.search('(答:|回复:)', text)).group(0)
+          elif re.search(self.title_life_dic['废标公告'], title + text[:150]) or re.search(self.life_dic['废标公告'], text[:150]):
+              return '废标公告', (re.search(self.title_life_dic['废标公告'], title + text[:150])
+                              or re.search(self.life_dic['废标公告'], text[:150])).group(0)
+          elif re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150]):
+              if re.search('候选人|公示期?(已?满|已经?结束)|中标(结果|公告)', text) is None:
+                  return '中标信息', '候选人公示排除,修改为中标信息'
+              return '候选人公示', (re.search(self.title_life_dic['候选人公示'], title)
+                               or re.search(self.life_dic['候选人公示'], text[:150])).group(0)
+          elif re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'], text[:150]):
+              # One of the two searches is guaranteed to match by the guard above.
+              return '合同公告', (re.search(self.title_life_dic['合同公告'], title)
+                              or re.search(self.title_life_dic['合同公告'], text[:150])).group(0)
+          elif re.search(self.life_dic['合同公告'].replace(';', '|'), text):
+              num, kw = count_diffser(self.life_dic['合同公告'], text)
+              if num >= 3:
+                  return '合同公告', kw
+              elif re.search(self.title_life_dic['招标公告'], title[-8:]):
+                  return '招标公告', re.search(self.title_life_dic['招标公告'], title[-8:]).group(0)
+              elif not is_contain_winner(extract_json):
+                  return '', '有合同关键词无中标角色返回空'
+              return '合同公告', re.search(self.life_dic['合同公告'].replace(';', '|'), text).group(0)
+          # Fixed: pass the bid-way (not the prem JSON) so a '单一来源' bid-way is recognised.
+          elif is_single_source(bidway, title):
+              return '中标信息', '单一来源采购'
+          elif re.search(self.title_life_dic['中标信息'], title):
+              if re.search(self.title_life_dic['资审结果'], title + text[:150]):
+                  return '资审结果', re.search(self.title_life_dic['资审结果'], title + text[:150]).group(0)
+              return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
+          elif re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text):
+              if re.search(self.title_life_dic['资审结果'], title + text[:150]):
+                  return '资审结果', re.search(self.title_life_dic['资审结果'], title + text[:150]).group(0)
+              return '中标信息', (re.search(self.title_life_dic['中标信息'], text[:100])
+                              or re.search(self.life_dic['中标信息'], text)).group(0)
+          elif re.search(self.life_dic['中标信息2'], text):
+              if re.search(self.wrong_win, text):
+                  return '招标公告', re.search(self.wrong_win, text).group(0)
+              return '中标信息', re.search(self.life_dic['中标信息2'], text).group(0)
+          elif re.search(self.life_dic['中标信息3'], text) and is_contain_winner(extract_json):
+              if re.search(self.wrong_win, text):
+                  return '招标公告', re.search(self.wrong_win, text).group(0)
+              return '中标信息', re.search(self.life_dic['中标信息3'], text).group(0)
+          elif re.search('公开选取.{,20}机构的公告', title):
+              if re.search(r'(中标|成交|中选)(中介|服务)?机构(名称)?[::\s]', text):
+                  return '中标信息', '机构选取有中选机构'
+              else:
+                  return '招标公告', '公开选取机构'
+          elif is_contain_winner(extract_json):
+              num, kw = count_diffser(self.life_dic['招标公告'], text)
+              if re.search(self.wrong_win, text):
+                  return '招标公告', re.search(self.wrong_win, text).group(0)
+              elif num >= 2:
+                  return '招标公告', kw
+              elif re.search('##richTextFetch##', text):
+                  return '', '提取到中标人但包含附件返回空'
+              return '中标信息', '提取到中标人'
+          elif re.search(self.title_life_dic['资审结果'], title + text[:150]) or re.search(self.life_dic['资审结果'], text):
+              return '资审结果', (re.search(self.title_life_dic['资审结果'], title + text[:150])
+                              or re.search(self.life_dic['资审结果'], text)).group(0)
+          elif re.search(self.title_life_dic['招标公告'], title) or re.search(self.life_dic['招标公告'].replace(';', '|'), text):
+              if re.search('意向|预告|变更|更正|中标|中选|成交|答疑|废标|流标|终止', title):
+                  return '', '招标正则召回标题有其他类别关键词,返回空'
+              return '招标公告', (re.search(self.title_life_dic['招标公告'], title)
+                              or re.search(self.life_dic['招标公告'].replace(';', '|'), text)).group(0)
+          else:
+              return '', '未预测到关键词, 返回空'
+
+      # Source channels for which no life-cycle stage is predicted.
+      # NOTE(review): keys are ints, so a string such as '104' is not matched — confirm callers pass ints.
+      not_extract_dic = {
+          104: '招标文件',
+          106: '法律法规',
+          107: '新闻资讯',
+          108: '拟建项目',
+          109: '展会推广',
+          110: '企业名录',
+          111: '企业资质',
+          112: '全国工程人员',
+          113: '业主采购'
+      }
+      if original_docchannel in not_extract_dic:
+          return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel]}}
+      if web_source_no in ['02104-7', '04733']:  # these data sources cannot be recognised
+          return {'docchannel': {'docchannel': '', 'doctype': '采招数据'}}
+
+      # Keep only CJK characters of the title; long titles keep head and tail.
+      title = re.sub('[^\u4e00-\u9fa5]', '', title)
+      if len(title) > 50:
+          title = title[:20] + title[-30:]
+
+      text = html2text(html)
+      prem_json = json.dumps(prem, ensure_ascii=False)
+      result = {'docchannel': {'docchannel': '', 'doctype': ''}}
+
+      doc_type, type_kw = get_type(title, text)
+      doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel)
+      if doc_type in self.title_type_dic:
+          result['docchannel']['doctype'] = doc_type
+      if doc_life in self.title_life_dic:
+          result['docchannel']['docchannel'] = doc_life
+
+      if doc_type == "" or doc_life == "":
+          # Regexes left at least one field empty: build the model inputs.
+          list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
+          tokens = [tok for sentence in list_sentence for tok in sentence.tokens]
+          content = ' '.join(tokens[:500])
+          # The title passed to the model is at most its last 50 characters.
+          data_content, data_title = self.predict_process(docid='', doctitle=title[-50:],
+                                                          dochtmlcon=content)
+          text_len = min(len(data_content[0]), self.sequen_len)
+          title_len = min(len(data_title[0]), self.title_len)
+
+          array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
+          array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
+
+          if doc_type == "":
+              pred = self.type_sess.run(self.type_softmax,
+                                        feed_dict={
+                                            self.type_title: array_title,
+                                            self.type_content: array_content,
+                                            self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
+                                            self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
+                                            self.type_prob: 1}
+                                        )
+              type_id = np.argmax(pred, axis=1)[0]
+              result['docchannel']['doctype'] = self.id2type[type_id]
+          if doc_life == "" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
+              if len(text) > 150 and re.search(self.kws, content):
+                  pred = self.lift_sess.run(self.lift_softmax,
+                                            feed_dict={
+                                                self.lift_title: array_title,
+                                                self.lift_content: array_content,
+                                                self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
+                                                self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
+                                                self.lift_prob: 1}
+                                            )
+                  life_id = np.argmax(pred, axis=1)[0]
+                  if self.id2life[life_id] == '中标信息' and original_docchannel in [52, '52', '招标公告'] and not is_contain_winner(prem_json):
+                      # The source says tender notice and no winner was extracted:
+                      # do not let the model turn it into an award notice.
+                      result['docchannel']['docchannel'] = '招标公告'
+                  else:
+                      result['docchannel']['docchannel'] = self.id2life[life_id]
+                      if result['docchannel']['docchannel'] == '中标信息':
+                          if self.is_houxuan(''.join([it for it in title if it.isalpha()]),
+                                             ''.join([it for it in content if it.isalpha()])):
+                              result['docchannel']['docchannel'] = '候选人公示'
+      return result
+
 # 保证金支付方式提取
 class DepositPaymentWay():
     def __init__(self,):