import os,sys # sys.path.append('/data/python/znj/BIDI_ML_INFO_EXTRACTION/') import numpy as np from BiddingKG.dl.common.models import * # import tensorflow as tf from keras.layers import * from keras.models import Model from keras.utils import Sequence,to_categorical import keras.backend as K # from keras.callbacks import Callback from keras.optimizers import Adam import pandas as pd from sklearn.metrics import classification_report maxlen = 512 words_size = 128 # batch_size = 64 class Attention(Layer): """多头注意力机制 """ def __init__(self, nb_head, size_per_head, **kwargs): self.nb_head = nb_head self.size_per_head = size_per_head self.out_dim = nb_head * size_per_head super(Attention, self).__init__(**kwargs) def build(self, input_shape): super(Attention, self).build(input_shape) q_in_dim = input_shape[0][-1] k_in_dim = input_shape[1][-1] v_in_dim = input_shape[2][-1] self.q_kernel = self.add_weight(name='q_kernel', shape=(q_in_dim, self.out_dim), initializer='glorot_normal') self.k_kernel = self.add_weight(name='k_kernel', shape=(k_in_dim, self.out_dim), initializer='glorot_normal') self.v_kernel = self.add_weight(name='w_kernel', shape=(v_in_dim, self.out_dim), initializer='glorot_normal') def mask(self, x, mask, mode='mul'): if mask is None: return x else: for _ in range(K.ndim(x) - K.ndim(mask)): mask = K.expand_dims(mask, K.ndim(mask)) if mode == 'mul': return x * mask else: return x - (1 - mask) * 1e10 def call(self, inputs): q, k, v = inputs[:3] v_mask, q_mask = None, None if len(inputs) > 3: v_mask = inputs[3] if len(inputs) > 4: q_mask = inputs[4] # 线性变换 qw = K.dot(q, self.q_kernel) kw = K.dot(k, self.k_kernel) vw = K.dot(v, self.v_kernel) # 形状变换 qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head)) kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head)) vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head)) # 维度置换 qw = K.permute_dimensions(qw, (0, 2, 1, 3)) kw = K.permute_dimensions(kw, (0, 2, 1, 3)) vw = K.permute_dimensions(vw, (0, 2, 1, 3)) # Attention a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head**0.5 a = K.permute_dimensions(a, (0, 3, 2, 1)) a = self.mask(a, v_mask, 'add') a = K.permute_dimensions(a, (0, 3, 2, 1)) a = K.softmax(a) # 完成输出 o = K.batch_dot(a, vw, [3, 2]) o = K.permute_dimensions(o, (0, 2, 1, 3)) o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim)) o = self.mask(o, q_mask, 'mul') return o def compute_output_shape(self, input_shape): return (input_shape[0][0], input_shape[0][1], self.out_dim) class OurLayer(Layer): """定义新的Layer,增加reuse方法,允许在定义Layer时调用现成的层 """ def reuse(self, layer, *args, **kwargs): if not layer.built: if len(args) > 0: inputs = args[0] else: inputs = kwargs['inputs'] if isinstance(inputs, list): input_shape = [K.int_shape(x) for x in inputs] else: input_shape = K.int_shape(inputs) layer.build(input_shape) outputs = layer.call(*args, **kwargs) for w in layer.trainable_weights: if w not in self._trainable_weights: self._trainable_weights.append(w) for w in layer.non_trainable_weights: if w not in self._non_trainable_weights: self._non_trainable_weights.append(w) for u in layer.updates: if not hasattr(self, '_updates'): self._updates = [] if u not in self._updates: self._updates.append(u) return outputs class OurBidirectional(OurLayer): """自己封装双向RNN,允许传入mask,保证对齐 """ def __init__(self, layer, **args): super(OurBidirectional, self).__init__(**args) self.forward_layer = layer.__class__.from_config(layer.get_config()) self.backward_layer = layer.__class__.from_config(layer.get_config()) self.forward_layer.name = 'forward_' + self.forward_layer.name self.backward_layer.name = 'backward_' + self.backward_layer.name def reverse_sequence(self, x, mask): """这里的mask.shape是[batch_size, seq_len, 1] """ seq_len = K.round(K.sum(mask, 1)[:, 0]) seq_len = K.cast(seq_len, 'int32') return tf.reverse_sequence(x, seq_len, seq_dim=1) def call(self, inputs): x, mask = inputs x_forward = self.reuse(self.forward_layer, x) x_backward = self.reverse_sequence(x, mask) x_backward = self.reuse(self.backward_layer, x_backward) x_backward = self.reverse_sequence(x_backward, mask) x = K.concatenate([x_forward, x_backward], -1) if K.ndim(x) == 3: return x * mask else: return x def compute_output_shape(self, input_shape): return input_shape[0][:-1] + (self.forward_layer.units * 2,) def classify_model(): num_classes = 6 embed_input = Input(shape=(None,words_size)) mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(embed_input) # mask = Lambda(lambda x: K.cast(K.not_equal(x, np.zeros(words_size,dtype=float)), 'float32'))(embed_input) # test_model = Model([embed_input],mask) input_drop = Dropout(0.25)(embed_input) t = OurBidirectional(GRU(64, return_sequences=True))([input_drop,mask]) h = Attention(8, 16)([t, t, t, mask]) h = Concatenate()([t, h]) # avg = layers.GlobalAveragePooling1D()(h) # output = Dense(num_classes, activation='softmax')(avg) h = Lambda(lambda x: x[0] * x[1])([h, mask]) h_dim = K.int_shape(h)[-1] h = Masking(mask_value=np.zeros(h_dim), input_shape=(maxlen, h_dim))(h) h = Dropout(0.25)(h) gru_output = Bidirectional(GRU(128))(h) output = Dense(num_classes, activation='softmax')(gru_output) # h = Dropout(0.25)(h) # atten = Attention02()(h,mask=K.squeeze(mask,axis=-1)) # output = Dense(num_classes, activation='softmax')(atten) model = Model([embed_input],output) model.summary() learn_rate = 0.0002 model.compile(optimizer=optimizers.Adam(lr=learn_rate), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score]) return model from BiddingKG.dl.common.nerUtils import getTokens import jieba def preprocess(text): text = re.sub("\n+",',',text) text = re.sub("\s+|?+",'',text) text = re.sub("[\.·_]{2,}", ',', text) text = re.sub("_", '', text) text = text[:2500] sentences = text.split("。") sentences = [s+"。" for s in sentences if s] if not sentences: return [] tokens = getTokens(sentences) new_tokens = [] for t in tokens: new_tokens.extend(t) return new_tokens def preprocess2(text): text = re.sub("\n+",',',text) text = re.sub("\s+|?+",'',text) text = re.sub("[\.·_]{2,}", ',', text) text = re.sub("_", '', text) text = text[:2500] tokens = list(jieba.cut(text)) return tokens from BiddingKG.dl.common.Utils import getModel_w2v model_w2v = getModel_w2v() def get_words_matrix(words): if words in model_w2v.vocab: return model_w2v[words] else: return model_w2v['unk'] def data_generate(): train_x = [] train_y = [] train_text = [] # # 数据1 # attachmentcon_list = [] # re_label_list = [] # data = pd.read_excel("attachment_data_relabel01.xlsx") # data = data[data['re_label'] != 6] # attachmentcon_list.extend([i for i in data['attachmentcon']]) # re_label_list.extend([i for i in data['re_label']]) # # filetitle_工程量清单标注.xlsx # data2 = pd.read_excel("filetitle_3.xlsx") # data2 = data2[:1887] # attachmentcon_list.extend([i for i in data2['attachmentcon']]) # re_label_list.extend([i for i in data2['re_label']]) # # filetitle_评标办法.xlsx # data3 = pd.read_excel("filetitle_5.xlsx") # attachmentcon_list.extend([i for i in data3['attachmentcon']]) # re_label_list.extend([i for i in data3['re_label']]) # # filetitle_限价(控制价).xlsx # data7 = pd.read_excel("filetitle_2.xlsx") # attachmentcon_list.extend([i for i in data7['attachmentcon']]) # re_label_list.extend([i for i in data7['re_label']]) # # data4 = pd.read_excel("attachment_data_pred_label2.xlsx") # data4 = data4[(data4['pred_label'] == 2)|(data4['pred_label'] == 5)|(data4['pred_label'] == 6)] # attachmentcon_list.extend([i for i in data4['attachmentcon']]) # re_label_list.extend([i for i in data4['re_label']]) # # data5 = pd.read_excel("attachment_data_nolabel01_test_pred.xlsx") # data5 = data5[(data5['pred_label'] == 5)|(data5['pred_label'] == 6)] # attachmentcon_list.extend([i for i in data5['attachmentcon']]) # re_label_list.extend([i for i in data5['re_label']]) # # filetitle_采购清单.xlsx # data6 = pd.read_excel("filetitle_4.xlsx") # data6 = data6[:900] # for filetitle,attachmentcon,re_label in zip(data6['filetitle'],data6['attachmentcon'],data6['re_label']): # if re_label==6: # re_label = 4 # attachmentcon = filetitle + attachmentcon # attachmentcon_list.append(attachmentcon) # re_label_list.append(re_label) # # data8 = pd.read_excel("attachment_data_relabel01_test_pred2.xlsx") # data8 = data8[(data8['pred_label'] == 5) | (data8['pred_label'] == 6)] # attachmentcon_list.extend([i for i in data8['attachmentcon']]) # re_label_list.extend([i for i in data8['re_label']]) # # for text, label in zip(attachmentcon_list, re_label_list): # text = str(text) # tokens = preprocess2(text) # tokens = tokens[:maxlen] # train_text.append("".join(tokens)) # words_matrix = np.zeros((maxlen, words_size)) # for i in range(len(tokens)): # words_matrix[i] = np.array(get_words_matrix(tokens[i])) # train_x.append(words_matrix) # y = np.zeros(6) # y[int(label)] = 1 # train_y.append(y) # 修正后数据 # data = pd.read_excel("test_pre_result4.xlsx") # 修正后数据(新家“评标结果”) data = pd.read_excel("test_pre_result5.xlsx") for text, label in zip(data['text'], data['re_label']): text = str(text) # tokens = preprocess2(text) # tokens = tokens[:maxlen] # train_text.append("".join(tokens)) # words_matrix = np.zeros((maxlen, words_size)) # for i in range(len(tokens)): # words_matrix[i] = np.array(get_words_matrix(tokens[i])) # train_x.append(words_matrix) # y = np.zeros(6) # y[int(label)] = 1 # train_y.append(y) train_y.append(label) train_x.append(text) # 'filetitle_评标办法222.xlsx' # data2 = pd.read_excel("filetitle_5222.xlsx") # # data2 = data[(data['filetype']!='zip')&(data['filetype']!='rar')] # for text, label in zip(data2['attachmentcon'], data2['re_label']): # text = str(text) # tokens = preprocess2(text) # tokens = tokens[:maxlen] # train_text.append("".join(tokens)) # words_matrix = np.zeros((maxlen, words_size)) # for i in range(len(tokens)): # words_matrix[i] = np.array(get_words_matrix(tokens[i])) # train_x.append(words_matrix) # y = np.zeros(6) # y[int(label)] = 1 # train_y.append(y) # train_y.append(label) # train_x.append(text) # filetitle_pingbiaojieguo0_pred2 评标结果类 # data3 = pd.read_excel("filetitle_pingbiaojieguo0_pred2.xlsx") # data3 = data3[data3['re_label']!=6] # for text, label in zip(data3['attachmentcon'], data3['re_label']): # text = str(text) # tokens = preprocess2(text) # tokens = tokens[:maxlen] # train_text.append("".join(tokens)) # words_matrix = np.zeros((maxlen, words_size)) # for i in range(len(tokens)): # words_matrix[i] = np.array(get_words_matrix(tokens[i])) # train_x.append(words_matrix) # y = np.zeros(6) # y[int(label)] = 1 # train_y.append(y) # train_y.append(label) # train_x.append(text) print("数据总量:",len(train_x)) # train_x = np.array(train_x) # train_y = np.array(train_y) # data_len = len(train_x) # indices = np.random.permutation(data_len) # train_x = train_x[indices] # train_y = train_y[indices] # test_len = int(data_len*0.1) # # test_idx = indices[:test_len] # # train_idx = indices[test_len:] # test_x = train_x[:test_len] # test_y = train_y[:test_len] # print("测试数据量:", len(test_x)) # train_x = train_x[test_len:] # train_y = train_y[test_len:] # print("训练数据量:",len(train_x)) return train_x,train_y,train_text def add_data(): train_x = [] train_y = [] # train_text = [] # add数据 data = pd.read_excel("time_202196_pred3.xlsx") data = data[(data['filetype']!='zip')&(data['filetype']!='rar')] data = data[(data['label_prob']>0.965)|(data['pred_label']==0)] for text, label in zip(data['attachmentcon'], data['pred_label']): text = str(text) # tokens = preprocess2(text) # tokens = tokens[:maxlen] # train_text.append("".join(tokens)) # words_matrix = np.zeros((maxlen, words_size)) # for i in range(len(tokens)): # words_matrix[i] = np.array(get_words_matrix(tokens[i])) # train_x.append(words_matrix) # y = np.zeros(6) # y[int(label)] = 1 # train_y.append(y) train_y.append(label) train_x.append(text) print("add数据量:",len(train_x)) # return train_x,train_y,train_text return train_x,train_y def train_1(): model = classify_model() # 载入数据 train_x, train_y,train_text = data_generate() data_len = len(train_x) # np.random.seed(7) # indices = np.random.permutation(data_len) import random random.seed(7) train_data = [d for d in zip(train_x,train_y,train_text)] random.shuffle(train_data) train_x = np.array([i[0] for i in train_data]) train_y = np.array([i[1] for i in train_data]) train_text = [i[2] for i in train_data] # train_x = train_x[indices] # train_y = train_y[indices] # train_text = [train_text[i] for i in indices.tolist()] test_len = int(data_len * 0.1) test_x = train_x[:test_len] test_y = train_y[:test_len] test_text = train_text[:test_len] print("测试数据量:", len(test_x)) train_x = train_x[test_len:] train_y = train_y[test_len:] train_text = train_text[test_len:] print("训练数据量:", len(train_x)) epochs = 45 batch_size = 256 checkpoint = ModelCheckpoint("model_label_classify3.weights",save_weights_only=True, monitor="val_loss", verbose=1, save_best_only=True, mode='min') model.fit(x=[train_x],y=train_y,validation_data=([test_x],test_y), epochs=epochs,batch_size=batch_size,shuffle=True,class_weight='auto',callbacks=[checkpoint]) model.load_weights("model_label_classify3.weights") y_pre = model.predict([test_x]) # 各类别预测评估 res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1)) print(res1) y_pre2 = model.predict([train_x]) res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1)) print(res2) result_df = pd.DataFrame({"text":test_text+train_text,'y_label':[np.argmax(i) for i in test_y.tolist()+train_y.tolist()],'pre_label':[np.argmax(i) for i in y_pre.tolist()+y_pre2.tolist()]}) result_df['is_same'] = [1 if i==j else 0 for i,j in zip(result_df['y_label'],result_df['pre_label'])] result_df.to_excel("test_pre_result5.xlsx") return model class DataGenerator(Sequence): 'Generates data for Keras' def __init__(self, texts, labels, batch_size=256, dim=(maxlen,words_size), n_classes=6, shuffle=True): 'Initialization' self.dim = dim self.batch_size = batch_size self.labels = labels self.texts = texts self.n_classes = n_classes self.shuffle = shuffle self.on_epoch_end() def __len__(self): 'Denotes the number of batches per epoch' _len = len(self.texts) // self.batch_size if len(self.texts) % self.batch_size != 0: _len += 1 return _len # return int(np.floor(len(self.texts) / self.batch_size)) def __getitem__(self, index): 'Generate one batch of data' # Generate indexes of the batch indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size] # Find list of IDs list_texts = [self.texts[k] for k in indexes] _label = [self.labels[k] for k in indexes] # Generate data X, y = self.__data_generation(list_texts,_label) return X, y def on_epoch_end(self): 'Updates indexes after each epoch' self.indexes = np.arange(len(self.texts)) if self.shuffle == True: np.random.shuffle(self.indexes) def __data_generation(self, list_texts,_label): 'Generates data containing batch_size samples' # Initialization # X = np.empty((self.batch_size, *self.dim)) # y = np.empty((self.batch_size), dtype=int) batch_len = len(list_texts) x = np.empty((batch_len, *self.dim)) y = np.empty((batch_len), dtype=int) # Generate data for i, text in enumerate(list_texts): # Store sample tokens = preprocess2(text) tokens = tokens[:maxlen] words_matrix = np.zeros((maxlen, words_size)) for j in range(len(tokens)): words_matrix[j] = np.array(get_words_matrix(tokens[j])) x[i,] = words_matrix # Store class y[i] = _label[i] return x, to_categorical(y, num_classes=self.n_classes) def train_2(): model = classify_model() # 载入数据 train_x, train_y = add_data() train_x2, train_y2,train_text = data_generate() data_len = len(train_x) import random random.seed(7) # train_data = [d for d in zip(train_x,train_y,train_text)] train_data = [d for d in zip(train_x,train_y)] random.shuffle(train_data) train_x = [i[0] for i in train_data] train_y = [i[1] for i in train_data] # train_text = [i[2] for i in train_data] test_len = int(data_len * 0.1) test_x = train_x[:test_len] test_y = train_y[:test_len] # test_text = train_text[:test_len] print("测试数据量:", len(test_x)) train_x = train_x[test_len:] train_y = train_y[test_len:] # 加上 train_x2, train_y2 train_x.extend(train_x2) train_y.extend(train_y2) # train_x = np.array(train_x) # train_y = np.array(train_y) # train_text = train_text[test_len:] print("训练数据量:", len(train_x)) epochs = 30 batch_size = 256 checkpoint = ModelCheckpoint("model_label_classify6.weights",save_weights_only=True, monitor="val_loss", verbose=1, save_best_only=True, mode='min') # model.fit(x=[train_x],y=train_y,validation_data=([test_x],test_y), # epochs=epochs,batch_size=batch_size,shuffle=True,class_weight='auto',callbacks=[checkpoint]) training_generator = DataGenerator(train_x, train_y) validation_generator = DataGenerator(test_x, test_y) model.fit_generator(generator=training_generator,validation_data=validation_generator, use_multiprocessing=True, workers=3, epochs=epochs,shuffle=True,class_weight='auto',callbacks=[checkpoint]) # model.load_weights("model_label_classify4.weights") # y_pre = model.predict([test_x]) # # 各类别预测评估 # res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1)) # print(res1) # y_pre2 = model.predict([train_x]) # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1)) # print(res2) # result_df = pd.DataFrame({"text":test_text+train_text,'y_label':[np.argmax(i) for i in test_y.tolist()+train_y.tolist()],'pre_label':[np.argmax(i) for i in y_pre.tolist()+y_pre2.tolist()]}) # result_df['is_same'] = [1 if i==j else 0 for i,j in zip(result_df['y_label'],result_df['pre_label'])] # result_df.to_excel("test_pre_result5.xlsx") return model class Attention02(Layer): def __init__(self, **kwargs): self.init = initializers.get('normal') self.supports_masking = True self.attention_dim = 50 super(Attention02, self).__init__(**kwargs) def build(self, input_shape): assert len(input_shape) == 3 self.W = K.variable(self.init((input_shape[-1], 1))) self.b = K.variable(self.init((self.attention_dim,))) self.u = K.variable(self.init((self.attention_dim, 1))) self.trainable_weights = [self.W, self.b, self.u] super(Attention02, self).build(input_shape) def compute_mask(self, inputs, mask=None): return mask def call(self, x, mask=None): uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b)) ait = K.dot(uit, self.u) ait = K.squeeze(ait, -1) ait = K.exp(ait) if mask is not None: ait = ait * K.cast(mask, K.floatx()) # ait = ait * mask ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx()) ait = K.expand_dims(ait) weighted_input = x * ait output = K.sum(weighted_input, axis=1) return output def compute_output_shape(self, input_shape): return (input_shape[0], input_shape[-1]) def predict_one(text,model): text = str(text) tokens = preprocess2(text) tokens = tokens[:maxlen] words_matrix = np.zeros((maxlen, words_size)) for i in range(len(tokens)): words_matrix[i] = np.array(get_words_matrix(tokens[i])) y = model.predict([np.array([words_matrix])]) y_label = np.argmax(y[0]) prob = y[0][y_label] return y_label,prob def test01(): # test & predict model = classify_model() model.load_weights("model_label_classify6.weights") test_batch_size = 2000 # data = pd.read_csv("attachment_data_relabel01.csv") # data = pd.read_csv("time_202196.csv",chunksize=test_batch_size ) data = pd.read_csv("time_20210923.csv",chunksize=test_batch_size ) # data = pd.read_csv("filetitle_pingbiaojieguo0.csv",chunksize=test_batch_size ) classes_dict = { 0: '其他', 1: '招标文件', 2: '限价(控制价)', 3: '工程量清单', 4: '采购清单', 5: '评标办法' } # data = data[data['new_label'] == 6] # print("test_nums",len(data)) idx = 0 new_df = pd.DataFrame() for df in data: train_x = [] train_text = [] for text in df['attachmentcon']: text = str(text) tokens = preprocess2(text) tokens = tokens[:maxlen] train_text.append("".join(tokens)) words_matrix = np.zeros((maxlen, words_size)) for i in range(len(tokens)): words_matrix[i] = np.array(get_words_matrix(tokens[i])) train_x.append(words_matrix) train_x = np.array(train_x) y_pre = model.predict([train_x]) pred_label = [np.argmax(i) for i in y_pre.tolist()] label_prob = [y[y_label] for y,y_label in zip(y_pre.tolist(),pred_label)] classes = [classes_dict[label] for label in pred_label] text_len = [len(text) for text in train_text] df['pred_label'] = pred_label df['类别'] = classes df['label_prob'] = label_prob df['attachmentcon'] = train_text df['text_len'] = text_len new_df = pd.concat([new_df,df]) idx += test_batch_size print(idx) new_df.to_excel("time_20210923_pred.xlsx") # 大量标注 def data_process4(): model, test_model = classify_model() model.load_weights("model_label_classify6.weights") # data = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data02.csv") data = pd.read_excel("C:/Users/Administrator/Desktop/attachment_data/time_202196.xlsx") print(data.info()) pred_label = [] label_prob = [] new_text = [] idx = 0 for text in data['attachmentcon']: print(idx) idx += 1 y_label, prob = predict_one(text, model) pred_label.append(y_label) label_prob.append(prob) data['pred_label'] = pred_label data['label_prob'] = label_prob # data['attachmenthtml'] = [re.sub('\n{2,}','',i.replace("