luojiehua
/
BIDI_ML_INFO_EXTRACTION


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599
							# coding: UTF-8
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import re
import os
import transformers
from transformers import ElectraTokenizer
import numpy as np

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu") # production (online) inference runs on CPU

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The per-channel divisor ``10000 ** (2 * (j // 2) / dim_hid)`` is computed
    once and kept as a non-persistent buffer, so it follows ``module.to(...)``
    without adding a key to ``state_dict()``.

    Args:
        dim_hid: size of the last dimension of the tensors passed to
            ``forward``.
    """

    def __init__(self,dim_hid):
        super(PositionalEncoding,self).__init__()
        base_array = np.array(
            [np.power(10000, 2 * (hid_j // 2) / dim_hid) for hid_j in range(dim_hid)])
        # Shape [D]. Registered as a buffer (not a plain attribute) so it is
        # not tied to a module-level device constant.
        self.register_buffer(
            "base_tensor",
            torch.from_numpy(base_array).to(torch.float32),
            persistent=False)

    def forward(self,x):
        """Return ``x`` plus the sinusoidal encoding.

        Args:
            x: tensor of shape (B, N, d) with ``d == dim_hid``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        B,N,d = x.shape
        # Build positions on the input's device to avoid device mismatches.
        pos = torch.arange(N, dtype=torch.float32, device=x.device).unsqueeze(-1)  # [N,1]
        angles = (pos / self.base_tensor.to(x.device)).unsqueeze(0)  # [1,N,D]
        enc = torch.empty_like(angles)
        enc[:, :, 0::2] = torch.sin(angles[:, :, 0::2])  # even channels
        enc[:, :, 1::2] = torch.cos(angles[:, :, 1::2])  # odd channels
        return x + enc

class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention over 4-D (batch, head, length, dim) tensors.

    Args:
        temperature: divisor applied to the queries before the dot product.
        attn_dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        """Return ``(output, attention_weights)``.

        Positions where ``mask == 0`` receive a score of -1e9 before the
        softmax, so their weight is effectively zero.
        """
        scaled_q = q / self.temperature
        scores = torch.matmul(scaled_q, k.transpose(2, 3))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        # The softmax is the dominant cost of this module (per the original
        # author's timing note).
        weights = torch.softmax(scores, dim=-1)
        weights = self.dropout(weights)

        return torch.matmul(weights, v), weights


class MultiHeadAttention(nn.Module):
    """Multi-head attention with rotary position embedding on q and k.

    The block projects q/k/v, applies RoPE to the per-head queries and keys,
    runs scaled dot-product attention, then applies the output projection,
    dropout, a residual connection and layer normalisation (post-norm).

    Args:
        n_head: number of attention heads.
        d_model: input/output feature size.
        d_k: per-head size of queries and keys.
        d_v: per-head size of values.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        # Bias-free projections; creation order is kept so that seeded
        # initialisation is unchanged.
        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
        self.fc = nn.Linear(n_head * d_v, d_model, bias=False)

        self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)

        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.rotaryEmbedding = RotaryEmbedding(d_k)

    def forward(self, q, k, v, mask=None):
        """Return ``(output, attention_weights)``.

        Args:
            q, k, v: tensors of shape (batch, length, d_model).
            mask: optional mask; two singleton axes are inserted after the
                batch axis so it broadcasts over heads and query positions
                (presumably shaped (batch, len_k) - confirm against caller).
        """
        heads, key_dim, val_dim = self.n_head, self.d_k, self.d_v
        batch = q.size(0)
        n_q, n_k, n_v = q.size(1), k.size(1), v.size(1)

        shortcut = q

        # Project, split into heads and move the head axis forward:
        # (b, len, n*d) -> (b, len, n, d) -> (b, n, len, d)
        q = self.w_qs(q).view(batch, n_q, heads, key_dim).transpose(1, 2)
        k = self.w_ks(k).view(batch, n_k, heads, key_dim).transpose(1, 2)
        v = self.w_vs(v).view(batch, n_v, heads, val_dim).transpose(1, 2)

        # Rotary position embedding: positions 0..len_q-1 for every batch row.
        position_ids = torch.arange(q.size(2), dtype=torch.long)
        position_ids = position_ids.unsqueeze(0).expand(q.size(0), q.size(2))
        cos_table, sin_table = self.rotaryEmbedding(q, position_ids)
        q, k = apply_rotary_pos_emb(q, k, cos_table, sin_table)

        if mask is not None:
            # Insert head and query axes for broadcasting.
            mask = mask.unsqueeze(1).unsqueeze(2)

        context, attn = self.attention(q, k, v, mask=mask)

        # (b, n, lq, dv) -> (b, lq, n, dv) -> (b, lq, n*dv)
        context = context.transpose(1, 2).contiguous().view(batch, n_q, -1)

        out = self.dropout(self.fc(context))
        out += shortcut

        out = self.layer_norm(out)

        return out, attn


class PositionwiseFeedForward(nn.Module):
    """Two-layer position-wise feed-forward block with residual and LayerNorm.

    Args:
        d_in: input and output feature size.
        d_hid: hidden feature size between the two linear layers.
        dropout: dropout probability applied before the residual add.
    """

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_in, d_hid)  # expansion, applied per position
        self.w_2 = nn.Linear(d_hid, d_in)  # projection back to d_in
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``LayerNorm(x + Dropout(W2(ReLU(W1(x)))))``."""
        hidden = torch.relu(self.w_1(x))
        projected = self.dropout(self.w_2(hidden))
        projected += x
        return self.layer_norm(projected)


class RotaryEmbedding(nn.Module):
    """Compute the cos/sin tables used by rotary position embedding (RoPE).

    Args:
        dim: per-head feature size (the tables have this size on the last
            axis).
        max_position_embeddings: stored on the instance; not otherwise used
            by this class.
        base: base of the geometric frequency progression.
        device: accepted for signature compatibility; unused.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim # it is set to the head_dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        # inv_freq_i = 1 / base^(2i/dim) for i = 0, 1, ..., dim // 2 - 1
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim))
        # Non-persistent: follows module.to(...) but is not saved in state_dict.
        self.register_buffer("inv_freq", tensor=inv_freq, persistent=False)

    @torch.no_grad()
    def forward(self, x, position_ids, seq_len=None):
        """Return ``(cos, sin)`` tables for the given positions.

        Args:
            x: tensor whose dtype and device the result should match
                (documented by the original as
                [bs, num_attention_heads, seq_len, head_size]).
            position_ids: integer tensor of shape [Batch_Size, Seq_Len].
            seq_len: unused.

        Returns:
            Two tensors of shape [Batch_Size, Seq_Len, dim] with dtype
            ``x.dtype`` on ``x.device``.
        """
        # Work on the input's device instead of a module-level constant, and
        # do not reassign the registered buffer on every call.
        target_device = x.device
        inv_freq = self.inv_freq.to(target_device)
        position_ids = position_ids.to(target_device)

        # inv_freq_expanded: [Batch_Size, Head_Dim // 2, 1]
        inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        # position_ids_expanded: [Batch_Size, 1, Seq_Len]
        position_ids_expanded = position_ids[:, None, :].float()

        # Outer product of frequencies and positions:
        # [B, D/2, 1] @ [B, 1, N] -> [B, D/2, N] -> transpose -> [B, N, D/2]
        freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
        # Duplicate so the table covers the full head dimension: [B, N, D]
        emb = torch.cat((freqs, freqs), dim=-1)
        return emb.cos().to(dtype=x.dtype), emb.sin().to(dtype=x.dtype)


def rotate_half(x):
    """Swap the two halves of the last axis, negating the second half.

    For a last axis ``[a, b]`` (``a`` = first ``d // 2`` entries, ``b`` = the
    rest) the result is ``[-b, a]``; this is the companion term multiplied by
    ``sin`` in the rotary embedding.
    """
    split_at = x.shape[-1] // 2
    front, back = x[..., :split_at], x[..., split_at:]
    return torch.cat((-back, front), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Apply rotary position embedding to the query and key tensors.

    Args:
        q, k: query/key tensors.
        cos, sin: tables produced by the rotary embedding module; an axis is
            inserted at ``unsqueeze_dim`` so they broadcast over the head axis.
        unsqueeze_dim: index at which the broadcast (head) axis is inserted.

    Returns:
        Tuple ``(q_embed, k_embed)`` where each is
        ``t * cos + rotate_half(t) * sin`` (formula (34) of the RoPE paper).
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def _rotate(t):
        return (t * cos_b) + (rotate_half(t) * sin_b)

    return _rotate(q), _rotate(k)


class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward block.

    Args:
        d_model: model feature size.
        d_inner: hidden size of the feed-forward block.
        n_head: number of attention heads.
        d_k: per-head query/key size.
        d_v: per-head value size.
        dropout: dropout probability passed to both sub-blocks.
    """

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)

    def forward(self, enc_input, slf_attn_mask=None):
        """Return ``(layer_output, self_attention_weights)``."""
        # Self-attention: the same tensor serves as query, key and value.
        attended, attn_weights = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        return self.pos_ffn(attended), attn_weights


class Encoder(nn.Module):
    """Token-embedding encoder built from a stack of self-attention layers.

    Args:
        n_src_vocab: vocabulary size of the embedding table.
        d_word_vec: embedding size.
        n_layers: number of encoder layers.
        n_head, d_k, d_v, d_model, d_inner: forwarded to every encoder layer.
        pad_idx: padding index of the embedding table.
        dropout: dropout probability for the embeddings and the layers.
        n_position: accepted but not used by this class.
        scale_emb: when true, embeddings are multiplied by ``d_model ** 0.5``.
        embedding: optional initial embedding weights (anything accepted by
            ``torch.tensor``).
    """

    def __init__(
            self, n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v,
            d_model, d_inner, pad_idx, dropout=0.1, n_position=200, scale_emb=False,embedding=None):

        super().__init__()

        emb_kwargs = {"padding_idx": pad_idx}
        if embedding is not None:
            # Initialise the table from the supplied weights.
            emb_kwargs["_weight"] = torch.tensor(embedding)
        self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, **emb_kwargs)

        # Constructed for compatibility; forward() does not apply it
        # (position information comes from the attention layers instead).
        self.position_enc = PositionalEncoding(d_word_vec)
        self.dropout = nn.Dropout(p=dropout)
        self.layer_stack = nn.ModuleList(
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers))
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.scale_emb = scale_emb
        self.d_model = d_model

    def forward(self, src_seq, src_mask, return_attns=False):
        """Encode ``src_seq``.

        Returns:
            The encoded tensor, or ``(encoded, attention_list)`` when
            ``return_attns`` is true (one attention tensor per layer).
        """
        attn_per_layer = []

        hidden = self.src_word_emb(src_seq)
        if self.scale_emb:
            hidden *= self.d_model ** 0.5

        hidden = self.layer_norm(self.dropout(hidden))

        for layer in self.layer_stack:
            hidden, layer_attn = layer(hidden, slf_attn_mask=src_mask)
            if return_attns:
                attn_per_layer.append(layer_attn)

        if not return_attns:
            return hidden
        return hidden, attn_per_layer

class bidiBert(nn.Module):
    """Document classifier: encoder -> first-token pooling -> softmax.

    Args:
        n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v, d_model,
        d_inner, pad_idx: forwarded to the encoder.
        n_class: number of output classes.
        embedding: optional initial embedding weights for the encoder.
    """

    def __init__(self, n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v,
                 d_model, d_inner, pad_idx,n_class,embedding = None):
        super(bidiBert, self).__init__()

        self.encoder = Encoder(n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v,
                               d_model, d_inner, pad_idx,embedding = embedding)

        self.dropout = nn.Dropout(p=0.1)
        # The pooler consumes encoder output, whose width is d_model (not
        # d_inner). Identical to the previous layer shape when
        # d_model == d_inner, and no longer fails when they differ.
        self.pooler = nn.Linear(d_model,d_inner)
        self.liner = nn.Linear(d_inner,n_class)
        # Kept for compatibility; forward() does not use the pooling layers.
        self.avg_pool1 = nn.AdaptiveAvgPool1d(1) # Max Pooling: nn.AdaptiveMaxPool1d(1)
        self.avg_pool2 = nn.AdaptiveAvgPool1d(1)

    def forward(self, input_title,input_doctext):
        """Return class probabilities for a batch of documents.

        Args:
            input_title: unused (the title is expected to be part of the
                document text); kept for interface compatibility.
            input_doctext: indexable pair ``(token_ids, attention_mask)``.

        Returns:
            Tensor of shape (batch, n_class) with softmax probabilities.
        """
        encoded = self.encoder(src_seq=input_doctext[0], src_mask=input_doctext[1])

        # Pool with the representation of the first token.
        pooled = self.pooler(encoded[:,0])
        pooled = self.dropout(pooled)
        pooled = torch.tanh(pooled)

        logits = self.liner(pooled)
        return F.softmax(logits, dim=-1)


# Phone-number patterns (mobile, +86 mobile, landline with area code and
# optional extension, parenthesised area code, 400 numbers, bare 7-8 digit
# numbers). Raw strings keep the regex escapes (\d, \+, ...) from being
# treated as invalid Python string escapes.
# NOTE(review): the classes `[\（|\(]` / `[\）|\)]` also match a literal '|';
# left unchanged to preserve existing matching behaviour.
phone = re.compile(r'1[3-9][0-9][-—－―]?\d{4}[-—－―]?\d{4}|'
                   r'\+86.?1[3-9]\d{9}|'
                   r'0[1-9]\d{1,2}[-—－―][2-9]\d{6}\d?[-—－―]\d{1,4}|'
                   r'0[1-9]\d{1,2}[-—－―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
                   r'0[1-9]\d{1,2}[-—－―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—－―]?[2-9]\d{6}\d?)|'
                   r'0[1-9]\d{1,2}[-—－―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|'
                   r'0[1-9]\d{1,2}[-—－―]{0,2}[2-9]\d{6}\d?|'
                   r'[\（|\(]0[1-9]\d{1,2}[\）|\)]-?[2-9]\d{6}\d?-?\d{,4}|'
                   r'400\d{7}转\d{1,4}|'
                   r'[2-9]\d{6,7}')
def text_process(text):
    """Clean announcement text before it is fed to the classifier.

    Removes control characters, extraction markers, URLs, e-mail addresses,
    long code-like tokens, phone numbers and HTML space entities, collapses
    whitespace, and normalises two domain terms.

    Args:
        text: raw text.

    Returns:
        The cleaned text.
    """
    text = text.strip()
    text = re.sub(r'[\000-\010]|[\013-\014]|[\016-\037]', "", text)  # illegal control characters
    text = re.sub("extractJson:|fullTextSeg:", "", text)

    text = re.sub("[?？]{2,}", "", text)  # runs of two or more question marks
    text = re.sub(r'(http[s]?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', "", text)  # URLs
    text = re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', "", text)  # e-mail addresses
    text = re.sub(r'[0-9a-zA-Z@#*%=?&~()_|<>/.（）｛｝【】{}\[\]\-]{6,}', "", text)  # serial numbers / codes

    text = phone.sub("", text)  # phone numbers

    text = re.sub(r'\b\d[-.\s]?\d{3}[-.\s]?\d{4}\b', "", text)  # landline
    text = re.sub(r'0[1-9]\d{1,2}[-—－―]{0,2}[2-9]\d{6}\d?', "", text)  # landline with area code
    text = re.sub(r'1(3[0-9]|4[01456879]|5[0-35-9]|6[2567]|7[0-8]|8[0-9]|9[0-35-9])\d{8}', "", text)  # mobile
    text = re.sub('&?nbsp;?|&?ensp;?|&?emsp;?', "", text)  # HTML space entities
    text = re.sub(r'\\n|\\r|\\t', "", text)  # literal backslash escapes left in the text
    text = re.sub(r"\s+", " ", text)  # collapse whitespace to one space

    # Normalise expressions the model does not recognise well.
    text = text.replace("中止", "终止")
    text = text.replace("遴选", "招标")

    return text

# Maps the classifier's output index to the channel class id
# (class ids are the keys of class_dict).
label2class_dict = {
    0: 51,
    1: 52,
    2: 101,
    3: 102,
    4: 103,
    5: 105,
    6: 114,
    7: 118,
    8: 119,
    9: 120,
    10: 121,
    11: 122,
}

def channel_predict(title,text):
    """Predict the channel class id of an announcement.

    The model and tokenizer are built on first use and cached in the module
    globals ``channel_pytorch_model`` / ``channel_tokenizer``.

    Args:
        title: announcement title.
        text: announcement body; may contain an ``##attachment##`` marker
            separating the main text from attachment text.

    Returns:
        The predicted class id, or ``None`` when the cleaned text is too
        short, the top probability is not above 0.5, or a post-rule vetoes
        the prediction.
    """
    module_state = globals()
    model = module_state.get("channel_pytorch_model")
    tokenizer = module_state.get("channel_tokenizer")

    if model is None or tokenizer is None:
        # Hyper-parameters of the shipped checkpoint.
        hyper_params = {
            'd_word_vec': 128,
            'n_layers': 3,
            'n_head': 3,
            'd_k': 128,
            'd_v': 128,
            'd_model': 128,
            'd_inner': 128,
            'pad_idx': 0,
        }
        here = os.path.abspath(os.path.dirname(__file__))

        # Tokenizer
        tokenizer = ElectraTokenizer.from_pretrained(here + "/pytorch_model/tokenizer")
        vocab_size = len(tokenizer.get_vocab())

        # Instantiate the model
        model = bidiBert(n_src_vocab=vocab_size, n_class=12, embedding=None, **hyper_params)
        model.to(device)

        checkpoint = torch.load(here + '/pytorch_model/channel.pth', map_location=device)
        # Keys the model does not know are re-inserted with the "module."
        # prefix removed, then the original (unexpected) keys are dropped.
        unexpected = set(checkpoint.keys()) - set(model.state_dict().keys())
        renamed = [(name.replace("module.", ""), weight)
                   for name, weight in checkpoint.items() if name in unexpected]
        for new_name, weight in renamed:
            checkpoint[new_name] = weight
        for name in list(unexpected):
            del checkpoint[name]
        model.load_state_dict(checkpoint)

        # Switch to evaluation mode and cache for subsequent calls.
        model.eval()
        module_state["channel_pytorch_model"] = model
        module_state["channel_tokenizer"] = tokenizer

    # --- text preparation ---
    # Remove the first occurrence of the title from the body (no-op if absent).
    text = text.replace(title, '', 1)
    main_text, marker, _attachment = text.partition("##attachment##")
    if marker and len(main_text) >= 500:
        # The main text is long enough; the attachment is not needed.
        text = main_text
    text = re.sub("##attachment##。?","",text)
    text = text_process(text)

    if len(text) <= 100:
        # Body too short: do not predict.
        return None
    if len(text) <= 150:
        # Body short: repeat it once.
        text = text * 2
    text = text[:2000]
    title = text_process(title)[:100]
    text = ("公告标题：" + title + "。" + "公告内容：" + text)[:2000]

    # --- tokenisation ---
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,      # add special tokens such as [CLS] and [SEP]
        max_length=2000,              # maximum sequence length
        padding='max_length',         # pad to the maximum length
        truncation=True,              # truncate longer inputs
        return_attention_mask=True,   # also return the attention mask
        return_tensors='pt',          # return PyTorch tensors
    )
    model_input = [
        torch.LongTensor(np.array([encoded['input_ids'].numpy()[0]])).to(device),
        torch.LongTensor(np.array([encoded['attention_mask'].numpy()[0]])).to(device),
    ]

    # --- prediction ---
    with torch.no_grad():
        outputs = model(None, model_input)
        top_prob, top_index = torch.max(outputs.data, 1)
        confidence = top_prob.cpu().numpy()
        if not confidence > 0.5:
            return None
        pred_class = label2class_dict[top_index.cpu().numpy()[0]]

    # --- rule-based correction ---
    if pred_class == 101:
        # Some "qualification review result" notices are misread as bid wins.
        if re.search("((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示",title):
            pred_class = 105
    elif pred_class == 122:
        if re.search("验收服务",title):
            pred_class = None

    return pred_class


# Channel class id -> channel name.
class_dict = {
    51: '公告变更',
    52: '招标公告',
    101: '中标信息',
    102: '招标预告',
    103: '招标答疑',
    104: '招标文件',
    105: '资审结果',
    106: '法律法规',
    107: '新闻资讯',
    108: '拟建项目',
    109: '展会推广',
    110: '企业名录',
    111: '企业资质',
    112: '全国工程',
    113: '业主采购',
    114: '采购意向',
    115: '拍卖出让',
    116: '土地矿产',
    117: '产权交易',
    118: '废标公告',
    119: '候选人公示',
    120: '合同公告',
    121: '开标记录',
    122: '验收合同',
}

# Channel names grouped into the tender side and the award side.
tenderee_type = [
    '公告变更', '招标公告', '招标预告', '招标答疑', '资审结果', '采购意向',
]
win_type = [
    '中标信息', '废标公告', '候选人公示', '合同公告', '开标记录', '验收合同',
]

def merge_channel(list_articles,channel_dic,original_docchannel):
    """Reconcile the rule-based channel with the model prediction.

    Args:
        list_articles: sequence whose first item exposes ``title`` and
            ``content`` attributes.
        channel_dic: dict with a ``'docchannel'`` sub-dict holding at least
            ``'doctype'`` and ``'docchannel'``; it may be updated in place.
        original_docchannel: class id of the channel given by the source
            site (falsy when unknown).

    Returns:
        The resulting channel dict (the input dict, possibly mutated, or a
        new dict when the source site's channel is adopted).
    """
    intent_pattern = "意向|意愿"
    award_like = ['中标信息','废标公告','候选人公示','合同公告']

    def merge_rule(title,text,docchannel,pred_channel,channel_dic,original_docchannel):
        # Decide which of (rule result, model prediction, source channel) wins.
        head_len = len(text)//3 if len(text)>300 else 100
        head = text[:head_len]
        pred_channel = class_dict[pred_channel]
        info = channel_dic['docchannel']

        if pred_channel == docchannel:
            info['use_original_docchannel'] = 0
            return channel_dic

        if pred_channel in ['采购意向','招标预告'] and docchannel in ['采购意向','招标预告']:
            if re.search(intent_pattern,title) or re.search(intent_pattern,head):
                info['docchannel'] = '采购意向'
            else:
                info['docchannel'] = "招标预告"
            info['use_original_docchannel'] = 0
            return channel_dic

        if pred_channel in ['公告变更','招标答疑'] and docchannel in ['公告变更','招标答疑']:
            info['docchannel'] = docchannel
            info['use_original_docchannel'] = 0
            return channel_dic

        if pred_channel=='公告变更' and docchannel in award_like:
            # A change notice for an award-type announcement stays award-type.
            info['docchannel'] = docchannel
            info['use_original_docchannel'] = 0
            return channel_dic

        if docchannel=='公告变更' and pred_channel in award_like:
            info['docchannel'] = pred_channel
            info['use_original_docchannel'] = 0
            return channel_dic

        original_type = class_dict.get(original_docchannel, '原始类别')
        # When prediction and rule result are on the same side (tender or
        # award) and the source channel is not, do not use the source channel.
        both_tender = pred_channel in tenderee_type and docchannel in tenderee_type
        both_award = pred_channel in win_type and docchannel in win_type
        if both_tender and original_type not in tenderee_type:
            info['use_original_docchannel'] = 0
        elif both_award and original_type not in win_type:
            info['use_original_docchannel'] = 0
        else:
            # Fall back to the source site's channel in a fresh dict.
            channel_dic = {'docchannel': {'doctype': '采招数据',
                                          'docchannel': original_type,
                                          'life_docchannel': original_type}}
            channel_dic['docchannel']['use_original_docchannel'] = 1
        return channel_dic

    article = list_articles[0]
    title = article.title
    text = article.content

    doctype = channel_dic['docchannel']['doctype']
    docchannel = channel_dic['docchannel']['docchannel']
    # Only these channels are compared against the model prediction.
    compare_type = ('公告变更','招标公告','中标信息','招标预告','招标答疑','资审结果','采购意向','废标公告','候选人公示',
                    '合同公告','开标记录','验收合同')

    if doctype=='采招数据':
        if docchannel in compare_type:
            single_source = re.search("单一来源",title) or re.search("单一来源",text[:100])
            if not single_source:
                pred = channel_predict(title, text)
                # Without a source channel there is nothing to reconcile.
                if pred is not None and original_docchannel:
                    channel_dic = merge_rule(title,text,docchannel,pred,channel_dic,original_docchannel)
        elif docchannel=="":
            pred = channel_predict(title, text)
            if pred is not None:
                channel_dic['docchannel']['docchannel'] = class_dict[pred]
                channel_dic['docchannel']['use_original_docchannel'] = 0

    # Rule correction for tender-preview style notices. channel_dic may have
    # been replaced above, so it is indexed again here.
    current = channel_dic['docchannel']
    if current['doctype']=='采招数据' and current['docchannel']=="招标公告":
        main_text = text.partition("##attachment##")[0]
        main_text = text_process(main_text)
        first_half = main_text[:len(main_text)//2]
        if re.search("采购实施月份|采购月份|预计(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",first_half):
            head_len = len(main_text) // 3 if len(main_text) > 300 else 100
            head = main_text[:head_len]
            if re.search(intent_pattern,title) or re.search(intent_pattern,head):
                current['docchannel'] = "采购意向"
            else:
                current['docchannel'] = "招标预告"
            current['use_original_docchannel'] = 0

    return channel_dic


if __name__ == '__main__':
    # Manual smoke test: run the predictor on one sample announcement and
    # print the predicted class id.
    title = '关于【2024年四好农村路大中村药红路、空坦路延伸段设计服务】无效项目的公示'
    text = '''关于【2024年四好农村路大中村药红路、空坦路延伸段设计服务】无效项目的公示 点击查看招标公告 关于【2024年四好农村路大中村药红路、空坦路延伸段设计服务】无效项目的公示 项目名称 2024年四好农村路大中村药红路、空坦路延伸段设计服务， 采购人 重庆市巴南区人民政府莲花街道办事处， 选取方式 直接选取， 是否重新发布招标公告 是 ，无效类型 项目取消， 无效原因 资质设置错误，附件已盖章上传 ，无效时间 2024-10-21 ，公示附件 大中村设计变更.jpg'''
    pred_class = channel_predict(title,text)
    print(pred_class)