# coding: UTF-8
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
import os
from transformers import ElectraTokenizer
import numpy as np

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")  # use CPU in production


class PositionalEncoding(nn.Module):
    def __init__(self, dim_hid):
        super(PositionalEncoding, self).__init__()
        base_array = np.array([np.power(10000, 2 * (hid_j // 2) / dim_hid)
                               for hid_j in range(dim_hid)])
        self.base_tensor = torch.from_numpy(base_array).to(torch.float32).to(device)  # [D]

    def forward(self, x):
        # x: (B, N, d)
        B, N, d = x.shape
        pos = torch.arange(N).unsqueeze(-1).to(torch.float32).to(device)  # [N, 1]
        pos = pos / self.base_tensor
        pos = pos.unsqueeze(0)
        pos[:, :, 0::2] = torch.sin(pos[:, :, 0::2])
        pos[:, :, 1::2] = torch.cos(pos[:, :, 1::2])
        return x + pos


class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        attn = torch.matmul(q / self.temperature, k.transpose(2, 3))
        if mask is not None:
            attn = attn.masked_fill(mask == 0, -1e9)
        attn = self.dropout(torch.softmax(attn, dim=-1))  # the main time cost
        output = torch.matmul(attn, v)
        return output, attn
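
# A minimal shape sketch for ScaledDotProductAttention (illustrative only; the
# tensor sizes below are arbitrary assumptions, not values used by the model):
def _sdpa_shape_check():
    attn_layer = ScaledDotProductAttention(temperature=64 ** 0.5)
    q = torch.randn(2, 4, 10, 64)  # (batch, n_head, seq_len, d_k)
    k = torch.randn(2, 4, 10, 64)
    v = torch.randn(2, 4, 10, 64)
    out, attn = attn_layer(q, k, v)
    assert out.shape == (2, 4, 10, 64) and attn.shape == (2, 4, 10, 10)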
class MultiHeadAttention(nn.Module):
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
        self.fc = nn.Linear(n_head * d_v, d_model, bias=False)
        self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.rotaryEmbedding = RotaryEmbedding(d_k)

    def forward(self, q, k, v, mask=None):
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)
        residual = q

        # Pass through the pre-attention projection: b x lq x (n*dv)
        # Separate different heads: b x lq x n x dv
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

        # Transpose for attention dot product: b x n x lq x dv
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        # Apply RoPE to queries and keys
        position_ids = torch.arange(q.size(2), dtype=torch.long).unsqueeze(0).expand(q.size(0), -1)
        _cos, _sin = self.rotaryEmbedding(q, position_ids)
        q, k = apply_rotary_pos_emb(q, k, _cos, _sin)

        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(2)  # for head-axis broadcasting

        q, attn = self.attention(q, k, v, mask=mask)
        # q: (sz_b, n_head, len_q, d_k)
        # k: (sz_b, n_head, len_k, d_k)
        # v: (sz_b, n_head, len_v, d_v)

        # Transpose to move the head dimension back: b x lq x n x dv
        # Combine the last two dimensions to concatenate all the heads: b x lq x (n*dv)
        q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
        q = self.dropout(self.fc(q))
        q += residual
        q = self.layer_norm(q)
        return q, attn


class PositionwiseFeedForward(nn.Module):
    ''' A two-feed-forward-layer module '''

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_in, d_hid)  # position-wise
        self.w_2 = nn.Linear(d_hid, d_in)  # position-wise
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        x = self.w_2(torch.relu(self.w_1(x)))
        x = self.dropout(x)
        x += residual
        x = self.layer_norm(x)
        return x


class RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        self.dim = dim  # set to the head dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # Compute theta_i = base^(-2i/dim) for i = 0, 1, ..., dim // 2 - 1
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim))
        self.register_buffer("inv_freq", tensor=inv_freq, persistent=False)

    @torch.no_grad()
    def forward(self, x, position_ids, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        self.inv_freq = self.inv_freq.to(device)
        position_ids = position_ids.to(device)
        # Copy the inv_freq tensor for each batch element.
        # inv_freq_expanded: [Batch_Size, Head_Dim // 2, 1]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        # position_ids_expanded: [Batch_Size, 1, Seq_Len]
        position_ids_expanded = position_ids[:, None, :].float()
        # Multiply each theta by the position (the argument of the sin and cos functions).
        # freqs: [Batch_Size, Head_Dim // 2, 1] @ [Batch_Size, 1, Seq_Len] --> [Batch_Size, Seq_Len, Head_Dim // 2]
        freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
        # emb: [Batch_Size, Seq_Len, Head_Dim]
        emb = torch.cat((freqs, freqs), dim=-1)
        # cos, sin: [Batch_Size, Seq_Len, Head_Dim]
        cos = emb.cos()
        sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    # Rotate the last dimension by splitting it in half: (x1, x2) -> (-x2, x1).
    x1 = x[..., : x.shape[-1] // 2]  # first half of the last dimension
    x2 = x[..., x.shape[-1] // 2:]   # second half of the last dimension
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    cos = cos.unsqueeze(unsqueeze_dim)  # add the head dimension
    sin = sin.unsqueeze(unsqueeze_dim)  # add the head dimension
    # Apply formula (34) of the RoFormer (rotary position embedding) paper.
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
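
# A minimal shape sketch for the RoPE helpers above (illustrative only; the
# sizes are arbitrary assumptions, not the model's real hyper-parameters):
def _rope_shape_check():
    rope = RotaryEmbedding(dim=64)
    q = torch.randn(2, 4, 10, 64)  # (batch, n_head, seq_len, head_dim)
    k = torch.randn(2, 4, 10, 64)
    position_ids = torch.arange(10).unsqueeze(0).expand(2, -1)
    cos, sin = rope(q, position_ids)               # each: (2, 10, 64)
    q2, k2 = apply_rotary_pos_emb(q, k, cos, sin)  # shapes unchanged
    assert q2.shape == q.shape and k2.shape == k.shape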
class EncoderLayer(nn.Module):
    ''' Compose with two layers '''

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)

    def forward(self, enc_input, slf_attn_mask=None):
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        enc_output = self.pos_ffn(enc_output)
        return enc_output, enc_slf_attn


class Encoder(nn.Module):
    ''' An encoder model with a self-attention mechanism. '''

    def __init__(
            self, n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v,
            d_model, d_inner, pad_idx, dropout=0.1, n_position=200,
            scale_emb=False, embedding=None):
        super().__init__()
        if embedding is None:
            self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx=pad_idx)
        else:
            self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx=pad_idx,
                                             _weight=torch.tensor(embedding))
        self.position_enc = PositionalEncoding(d_word_vec)
        self.dropout = nn.Dropout(p=dropout)
        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.scale_emb = scale_emb
        self.d_model = d_model

    def forward(self, src_seq, src_mask, return_attns=False):
        enc_slf_attn_list = []

        enc_output = self.src_word_emb(src_seq)
        if self.scale_emb:
            enc_output *= self.d_model ** 0.5
        # Absolute positional encoding is disabled; RoPE is applied inside attention.
        # enc_output = self.dropout(self.position_enc(enc_output))
        enc_output = self.dropout(enc_output)
        enc_output = self.layer_norm(enc_output)

        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(enc_output, slf_attn_mask=src_mask)
            enc_slf_attn_list += [enc_slf_attn] if return_attns else []

        if return_attns:
            return enc_output, enc_slf_attn_list
        return enc_output
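
# A minimal end-to-end shape sketch for the Encoder (illustrative values; the
# real hyper-parameters are the ones set in channel_predict below):
def _encoder_shape_check():
    enc = Encoder(n_src_vocab=100, d_word_vec=128, n_layers=1, n_head=3,
                  d_k=128, d_v=128, d_model=128, d_inner=128, pad_idx=0)
    src_seq = torch.randint(1, 100, (2, 10))        # token ids
    src_mask = torch.ones(2, 10, dtype=torch.long)  # 1 = attend, 0 = padding
    out = enc(src_seq, src_mask)
    assert out.shape == (2, 10, 128)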
class bidiBert(nn.Module):
    ''' Document-channel classifier: RoPE transformer encoder + [CLS] pooler + softmax head. '''

    def __init__(self, n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v,
                 d_model, d_inner, pad_idx, n_class, embedding=None):
        super(bidiBert, self).__init__()
        self.encoder = Encoder(n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v,
                               d_model, d_inner, pad_idx, embedding=embedding)
        self.dropout = nn.Dropout(p=0.1)
        self.pooler = nn.Linear(d_inner, d_inner)
        self.liner = nn.Linear(d_inner, n_class)  # name kept as "liner" to match checkpoint keys
        self.avg_pool1 = nn.AdaptiveAvgPool1d(1)  # max-pooling alternative: nn.AdaptiveMaxPool1d(1)
        self.avg_pool2 = nn.AdaptiveAvgPool1d(1)

    def forward(self, input_title, input_doctext):
        # The title branch is unused: the title is concatenated into the document
        # text upstream, so only input_doctext = (input_ids, attention_mask) is encoded.
        input_doctext = self.encoder(src_seq=input_doctext[0], src_mask=input_doctext[1])
        # Pool the first ([CLS]) position, project, then classify.
        input_doctext = self.pooler(input_doctext[:, 0])
        input_doctext = self.dropout(input_doctext)
        input_doctext = torch.tanh(input_doctext)
        out = self.liner(input_doctext)
        out = F.softmax(out, dim=-1)
        return out


phone = re.compile(r'1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
                   r'\+86.?1[3-9]\d{9}|'
                   r'0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
                   r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
                   r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|'
                   r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|'
                   r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
                   r'[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|'
                   r'400\d{7}转\d{1,4}|'
                   r'[2-9]\d{6,7}')


def text_process(text):
    text = text.strip()
    text = re.sub(r'[\000-\010]|[\013-\014]|[\016-\037]', "", text)  # illegal control characters
    text = re.sub("extractJson:|fullTextSeg:", "", text)
    text = re.sub("[??]{2,}", "", text)  # runs of question marks
    text = re.sub(r'(http[s]?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', "", text)  # URLs
    text = re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', "", text)  # e-mail addresses
    text = re.sub(r'[0-9a-zA-Z@#*%=?&~()_|<>/.(){}【】{}\[\]\-]{6,}', "", text)  # long codes / serial numbers
    text = re.sub(phone, "", text)  # phone numbers
    text = re.sub(r'\b\d[-.\s]?\d{3}[-.\s]?\d{4}\b', "", text)  # landline numbers
    text = re.sub(r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?', "", text)  # landline numbers
    text = re.sub(r'1(3[0-9]|4[01456879]|5[0-35-9]|6[2567]|7[0-8]|8[0-9]|9[0-35-9])\d{8}', "", text)  # mobile numbers
    text = re.sub('&?nbsp;?|&?ensp;?|&?emsp;?', "", text)  # HTML space entities
    text = re.sub(r'\\n|\\r|\\t', "", text)  # escaped line breaks left over as literal text
    text = re.sub(r"\s+", " ", text)  # collapse whitespace
    # Normalize an expression the model does not recognize well:
    # "中止" (suspend) -> "终止" (terminate)
    text = re.sub("中止", "终止", text)
    return text
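
# Illustrative demo of text_process (the input string is made up; exactly which
# rule strips the number first depends on the substitution order above):
def _text_process_demo():
    raw = "某项目公告 联系电话:13812345678 详见http://example.com \\n 中止公告"
    print(text_process(raw))
    # -> the phone number, URL and escaped "\n" are removed, and "中止" becomes "终止"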
label2class_dict = {
    0: 51, 1: 52, 2: 101, 3: 102, 4: 103, 5: 105,
    6: 114, 7: 118, 8: 119, 9: 120, 10: 121, 11: 122
}


def channel_predict(title, text):
    # Lazily build the model and tokenizer once, then cache them in globals().
    if globals().get("channel_pytorch_model") is None or globals().get("channel_tokenizer") is None:
        # config
        config = {
            'd_word_vec': 128,
            'n_layers': 3,
            'n_head': 3,
            'd_k': 128,
            'd_v': 128,
            'd_model': 128,
            'd_inner': 128,
            'pad_idx': 0
        }
        d_word_vec = config['d_word_vec']
        n_layers = config['n_layers']
        n_head = config['n_head']
        d_k = config['d_k']
        d_v = config['d_v']
        d_model = config['d_model']
        d_inner = config['d_inner']
        pad_idx = config['pad_idx']
        n_class = 12

        # tokenizer
        base_model_name = os.path.abspath(os.path.dirname(__file__)) + "/pytorch_model/tokenizer"
        tokenizer = ElectraTokenizer.from_pretrained(base_model_name)
        n_src_vocab = len(tokenizer.get_vocab())

        # instantiate the model
        model_path = os.path.abspath(os.path.dirname(__file__)) + '/pytorch_model/channel.pth'
        model = bidiBert(n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v,
                         d_model, d_inner, pad_idx, n_class, embedding=None)
        model.to(device)
        model_state = torch.load(model_path, map_location=device)
        # Strip the "module." prefix that nn.DataParallel checkpoints carry.
        model_state_dict = model.state_dict()
        unexpected_keys = set(model_state.keys()) - set(model_state_dict.keys())
        add_kv = []
        for k, v in model_state.items():
            if k in unexpected_keys:
                add_kv.append([k.replace("module.", ""), v])
        for k, v in add_kv:
            model_state[k] = v
        for k in unexpected_keys:
            del model_state[k]
        model.load_state_dict(model_state)
        model.eval()  # set the model to evaluation mode
        globals()["channel_pytorch_model"] = model
        globals()["channel_tokenizer"] = tokenizer
    else:
        model = globals().get("channel_pytorch_model")
        tokenizer = globals().get("channel_tokenizer")

    # process text
    if title in text:
        text = text.replace(title, '', 1)
    text = text_process(text)
    text = re.sub("##attachment##。?", "", text)
    if len(text) <= 100:  # body text too short: skip prediction
        return
    elif len(text) <= 200:  # body text fairly short: repeat it
        text = text * 2
    text = text[:2000]
    title = text_process(title)
    title = title[:100]
    text = "公告标题:" + title + "。" + "公告内容:" + text
    text = text[:2000]

    # to torch tensors
    text_max_len = 2000
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,     # add special tokens such as [CLS] and [SEP]
        max_length=text_max_len,     # maximum length
        padding='max_length',        # pad to max_length
        truncation=True,             # truncate text over max_length
        return_attention_mask=True,  # return the attention_mask
        return_tensors='pt'          # return PyTorch tensors
    )
    text = [encoded['input_ids'].to(device), encoded['attention_mask'].to(device)]

    # predict
    with torch.no_grad():
        outputs = model(None, text)
        pred_prob, predic = torch.max(outputs.data, 1)
        pred_prob = pred_prob.cpu().numpy()
        predic = predic.cpu().numpy()
    if pred_prob[0] > 0.5:
        pred_label = predic[0]
        pred_class = label2class_dict[pred_label]
    else:
        return

    # check: correct '资审结果' (prequalification result) notices that the model
    # misclassifies as award notices
    if pred_class == 101 and re.search("((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示", title):
        pred_class = 105
    return pred_class


class_dict = {51: '公告变更', 52: '招标公告', 101: '中标信息', 102: '招标预告',
              103: '招标答疑', 104: '招标文件', 105: '资审结果', 106: '法律法规',
              107: '新闻资讯', 108: '拟建项目', 109: '展会推广', 110: '企业名录',
              111: '企业资质', 112: '全国工程', 113: '业主采购', 114: '采购意向',
              115: '拍卖出让', 116: '土地矿产', 117: '产权交易', 118: '废标公告',
              119: '候选人公示', 120: '合同公告', 121: '开标记录', 122: '验收合同'}
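
# Quick consistency check (illustrative): every class id the model can emit
# through label2class_dict should have a readable name in class_dict.
def _label_mapping_check():
    for label, class_id in label2class_dict.items():
        assert class_id in class_dict, f"missing name for class id {class_id}"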
def merge_channel(list_articles, channel_dic, original_docchannel):

    def merge_rule(title, text, docchannel, pred_channel, channel_dic):
        front_text_len = len(text) // 3 if len(text) > 300 else 100
        front_text = text[:front_text_len]
        pred_channel = class_dict[pred_channel]
        if pred_channel == docchannel:
            channel_dic['docchannel']['use_original_docchannel'] = 0
        else:
            if pred_channel in ['采购意向', '招标预告'] and docchannel in ['采购意向', '招标预告']:
                merge_res = '采购意向' if re.search("意向|意愿", title) or re.search("意向|意愿", front_text) else "招标预告"
                channel_dic['docchannel']['docchannel'] = merge_res
                channel_dic['docchannel']['use_original_docchannel'] = 0
            elif pred_channel in ['公告变更', '招标答疑'] and docchannel in ['公告变更', '招标答疑']:
                channel_dic['docchannel']['docchannel'] = docchannel
                channel_dic['docchannel']['use_original_docchannel'] = 0
            elif pred_channel == '公告变更' and docchannel in ['中标信息', '废标公告', '候选人公示', '合同公告']:
                # a change notice on an award-type notice is still treated as award-type
                channel_dic['docchannel']['docchannel'] = docchannel
                channel_dic['docchannel']['use_original_docchannel'] = 0
            elif docchannel == '公告变更' and pred_channel in ['中标信息', '废标公告', '候选人公示', '合同公告']:
                channel_dic['docchannel']['docchannel'] = pred_channel
                channel_dic['docchannel']['use_original_docchannel'] = 0
            else:
                channel_dic = {'docchannel': {'doctype': '采招数据',
                                              'docchannel': class_dict.get(original_docchannel, '原始类别'),
                                              'life_docchannel': class_dict.get(original_docchannel, '原始类别')}}
                channel_dic['docchannel']['use_original_docchannel'] = 1
        return channel_dic

    article = list_articles[0]
    title = article.title
    text = article.content
    doctype = channel_dic['docchannel']['doctype']
    docchannel = channel_dic['docchannel']['docchannel']
    # Only these categories are compared against the model prediction.
    compare_type = ['公告变更', '招标公告', '中标信息', '招标预告', '招标答疑', '资审结果',
                    '采购意向', '废标公告', '候选人公示', '合同公告', '开标记录', '验收合同']
    if doctype == '采招数据' and docchannel in compare_type:
        if not re.search("单一来源", title) and not re.search("单一来源", text[:100]):
            pred = channel_predict(title, text)
            # Without an original_docchannel there is nothing to compare against.
            if pred is not None and original_docchannel:
                channel_dic = merge_rule(title, text, docchannel, pred, channel_dic)
    elif doctype == '采招数据' and docchannel == "":
        pred = channel_predict(title, text)
        if pred is not None:
            pred = class_dict[pred]
            channel_dic['docchannel']['docchannel'] = pred
            channel_dic['docchannel']['use_original_docchannel'] = 0
    return channel_dic


if __name__ == '__main__':
    title = '关于【2024年四好农村路大中村药红路、空坦路延伸段设计服务】无效项目的公示'
    text = '''关于【2024年四好农村路大中村药红路、空坦路延伸段设计服务】无效项目的公示 点击查看招标公告 关于【2024年四好农村路大中村药红路、空坦路延伸段设计服务】无效项目的公示 项目名称 2024年四好农村路大中村药红路、空坦路延伸段设计服务, 采购人 重庆市巴南区人民政府莲花街道办事处, 选取方式 直接选取, 是否重新发布招标公告 是 ,无效类型 项目取消, 无效原因 资质设置错误,附件已盖章上传 ,无效时间 2024-10-21 ,公示附件 大中村设计变更.jpg'''
    pred_class = channel_predict(title, text)
    print(pred_class)