import sys
import os
sys.path.append(os.path.abspath("../.."))
# sys.path.append('/data/python_znj/znj/BIDI_ML_INFO_EXTRACTION/')
import pandas as pd
import re
import psycopg2
from keras.callbacks import ModelCheckpoint
from keras import layers, models, optimizers, losses
from keras.layers import *
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.common.models import *
from sklearn.metrics import classification_report
from sklearn.utils import shuffle, class_weight
import matplotlib.pyplot as plt
import random

input_shape = (2, 30, 60)
input_shape2 = (2, 40, 128)
# output_shape = [4]
time_label_dict = {
    'time': 0,
    'time_release': 1,             # release time
    'time_bidopen': 2,             # bid opening time
    'time_bidclose': 3,            # bid closing time
    'time_bidstart': 12,           # bid submission (start) time / response-document receiving (start) time
    'time_publicityStart': 4,      # publicity start time (publicity time / publicity period)
    'time_publicityEnd': 5,        # publicity end time
    'time_getFileStart': 6,        # document acquisition start time (document acquisition time)
    'time_getFileEnd': 7,          # document acquisition end time
    'time_registrationStart': 8,   # registration start time (registration time)
    'time_registrationEnd': 9,     # registration end time
    'time_earnestMoneyStart': 10,  # earnest money submission start time (earnest money submission time)
    'time_earnestMoneyEnd': 11,    # earnest money submission end time
    'time_commencement': 13,       # commencement date
    'time_completion': 14          # completion date
}
output_shape = [len(time_label_dict)]


def get_data():
    data_load = pd.read_csv("newdata_30_prc.csv", index_col=0)
    id_set = set()
    for id in data_load['document_id']:
        id_set.add(id)
    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.103")
    sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
          "FROM corpus_iedocument A,brat_bratannotation B " \
          "WHERE A.human_identifier = '%s' " \
          "AND A.human_identifier = B.document_id "
    db_data = []
    count = 0
    for id in list(id_set):
        count += 1
        print(count)
        cur1 = conn.cursor()
        cur1.execute(sql % (id))
        db_data.extend(cur1.fetchall())
        cur1.close()
    conn.close()
    columns = ['document_id', 'sentences', 'tokens', 'offsets_to_text', 'value']
    df = pd.DataFrame(db_data, columns=columns)
    df = df[df['value'].str.contains('time')]
    df = df.reset_index(drop=True)
    print(len(df))
    time_label = df['value'].str.split(expand=True)
    time_label.columns = ['_', 'label_type', 'begin_index', 'end_index', 'entity_text']
    time_label = time_label.drop('_', axis=1)
    df = pd.concat([df, time_label], axis=1)
    print(df.info())
    df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
    df['sentences'] = [eval(sentence) for sentence in df['sentences']]
    # df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
    # df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
    df['offsets_to_text'] = [eval(offset) for offset in df['offsets_to_text']]
    # df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
    # df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
    save(df, 'db_time_data.pk')
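
# Illustrative sketch (my own helper, not part of the original pipeline): the integer indices in
# time_label_dict become one-hot targets of length output_shape[0], exactly the way training() and
# train2() build their y vectors with np.zeros(output_shape); y[label] = 1.
def label_to_onehot_example(label_type='time_bidopen'):
    y = np.zeros(output_shape)
    y[time_label_dict[label_type]] = 1
    return y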
def getModel():
    '''
    @summary: time classification model
    '''
    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    L_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(L_input)
    # L_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(L_input)
    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
    R_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(R_input)
    # R_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(R_input)
    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
    concat = layers.merge([avg_l, avg_r], mode='concat')
    # lstm = layers.LSTM(24,return_sequences=False,dropout=0.2)(concat)
    output = layers.Dense(output_shape[0], activation="softmax")(concat)

    model = models.Model(inputs=[L_input, R_input], outputs=output)

    learn_rate = 0.0005
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model


def getModel2():
    '''
    @summary: time classification model
    '''
    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(L_input)
    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(R_input)

    L_input_drop = Dropout(0.3)(L_input)
    R_input_drop = Dropout(0.3)(R_input)
    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop, L_mask])
    L_att = Attention02()(L_lstm, mask=K.squeeze(L_mask, axis=-1))
    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop, R_mask])
    R_att = Attention02()(R_lstm, mask=K.squeeze(R_mask, axis=-1))

    L_R = layers.merge([L_lstm, R_lstm], concat_axis=1, mode='concat')
    L_R_mask = layers.merge([L_mask, R_mask], concat_axis=1, mode='concat')
    L_R_att = Attention02()(L_R, mask=K.squeeze(L_R_mask, axis=-1))
    L_att = layers.add([L_att, L_R_att])
    R_att = layers.add([R_att, L_R_att])

    concat = layers.merge([L_att, R_att], mode='concat')
    concat = Dropout(0.2)(concat)
    output = layers.Dense(output_shape[0], activation="softmax")(concat)

    model = models.Model(inputs=[L_input, R_input], outputs=output)

    learn_rate = 0.00005
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model


# def getModel2():
#     '''
#     @summary: time classification model
#     '''
#     L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
#     L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
#     R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
#     R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
#
#     L_input_drop = Dropout(0.3)(L_input)
#     R_input_drop = Dropout(0.3)(R_input)
#     # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
#     L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
#     L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
#     # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
#     R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
#     R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
#     concat = layers.merge([L_att, R_att], mode='concat')
#
#     concat = Dropout(0.2)(concat)
#     output = layers.Dense(output_shape[0],activation="softmax")(concat)
#
#     model = models.Model(inputs=[L_input,R_input], outputs=output)
#
#     learn_rate = 0.00005
#     model.compile(optimizer=optimizers.Adam(lr=learn_rate),
#                   loss=losses.binary_crossentropy,
#                   metrics=[precision,recall,f1_score])
#     model.summary()
#     return model
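
# Minimal smoke-test sketch (my own illustration, not an original entry point): each branch of the
# models above consumes a (40, 128) context-window embedding, i.e. input_shape2[1:], and the softmax
# output has one column per entry of time_label_dict. Random arrays stand in for real embeddings here.
def smoke_test_model_example():
    m = getModel()
    dummy_left = np.random.rand(2, *input_shape2[1:]).astype('float32')
    dummy_right = np.random.rand(2, *input_shape2[1:]).astype('float32')
    return m.predict([dummy_left, dummy_right]).shape  # expected: (2, len(time_label_dict))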
def getModel3():
    '''
    @summary: time classification model
    '''
    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(L_input)
    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(R_input)

    L_input_drop = Dropout(0.3)(L_input)
    R_input_drop = Dropout(0.3)(R_input)
    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop, L_mask])
    # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop, R_mask])
    concat = layers.merge([L_lstm, R_lstm], mode='concat', concat_axis=1)
    concat_mask = layers.merge([L_mask, R_mask], mode='concat', concat_axis=1)
    att = Attention02()(concat, mask=K.squeeze(concat_mask, axis=-1))
    # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
    # concat = layers.merge([L_att, R_att], mode='concat')

    att = Dropout(0.2)(att)
    output = layers.Dense(output_shape[0], activation="softmax")(att)

    model = models.Model(inputs=[L_input, R_input], outputs=output)

    learn_rate = 0.0001
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model


class Attention(Layer):
    """Multi-head attention mechanism."""
    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.out_dim = nb_head * size_per_head
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        super(Attention, self).build(input_shape)
        q_in_dim = input_shape[0][-1]
        k_in_dim = input_shape[1][-1]
        v_in_dim = input_shape[2][-1]
        self.q_kernel = self.add_weight(name='q_kernel',
                                        shape=(q_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.k_kernel = self.add_weight(name='k_kernel',
                                        shape=(k_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.v_kernel = self.add_weight(name='w_kernel',
                                        shape=(v_in_dim, self.out_dim),
                                        initializer='glorot_normal')

    def mask(self, x, mask, mode='mul'):
        if mask is None:
            return x
        else:
            for _ in range(K.ndim(x) - K.ndim(mask)):
                mask = K.expand_dims(mask, K.ndim(mask))
            if mode == 'mul':
                return x * mask
            else:
                return x - (1 - mask) * 1e10

    def call(self, inputs):
        q, k, v = inputs[:3]
        v_mask, q_mask = None, None
        if len(inputs) > 3:
            v_mask = inputs[3]
            if len(inputs) > 4:
                q_mask = inputs[4]
        # linear projections
        qw = K.dot(q, self.q_kernel)
        kw = K.dot(k, self.k_kernel)
        vw = K.dot(v, self.v_kernel)
        # reshape to heads
        qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
        kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
        vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
        # permute dimensions
        qw = K.permute_dimensions(qw, (0, 2, 1, 3))
        kw = K.permute_dimensions(kw, (0, 2, 1, 3))
        vw = K.permute_dimensions(vw, (0, 2, 1, 3))
        # attention
        a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head**0.5
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = self.mask(a, v_mask, 'add')
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = K.softmax(a)
        # compute the output
        o = K.batch_dot(a, vw, [3, 2])
        o = K.permute_dimensions(o, (0, 2, 1, 3))
        o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
        o = self.mask(o, q_mask, 'mul')
        return o

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.out_dim)
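
# Illustrative sketch: the multi-head Attention layer above is not wired into getModel/getModel2/
# getModel3 in this file; when used, it is called as Attention(nb_head, size_per_head)([q, k, v])
# and returns (batch, seq_q, nb_head * size_per_head). A self-attention block over one context
# branch could look like this (my own example, not an original model):
def multihead_selfattention_example():
    x = layers.Input(shape=input_shape2[1:], dtype='float32')  # (batch, 40, 128)
    o = Attention(8, 16)([x, x, x])                            # 8 heads x 16 dims -> 128
    return models.Model(inputs=x, outputs=o)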
class Attention02(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = 50
        super(Attention02, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], 1)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(Attention02, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)
        ait = K.exp(ait)
        if mask is not None:
            ait = ait * K.cast(mask, K.floatx())
            # ait = ait * mask
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])


class OurLayer(Layer):
    """Custom Layer base class that adds a reuse() method, so existing layers can be
    called while a new Layer is being defined."""
    def reuse(self, layer, *args, **kwargs):
        if not layer.built:
            if len(args) > 0:
                inputs = args[0]
            else:
                inputs = kwargs['inputs']
            if isinstance(inputs, list):
                input_shape = [K.int_shape(x) for x in inputs]
            else:
                input_shape = K.int_shape(inputs)
            layer.build(input_shape)
        outputs = layer.call(*args, **kwargs)
        for w in layer.trainable_weights:
            if w not in self._trainable_weights:
                self._trainable_weights.append(w)
        for w in layer.non_trainable_weights:
            if w not in self._non_trainable_weights:
                self._non_trainable_weights.append(w)
        for u in layer.updates:
            if not hasattr(self, '_updates'):
                self._updates = []
            if u not in self._updates:
                self._updates.append(u)
        return outputs


class OurBidirectional(OurLayer):
    """Hand-rolled bidirectional RNN wrapper that accepts a mask so the reversed
    sequences stay aligned with the original ones."""
    def __init__(self, layer, **args):
        super(OurBidirectional, self).__init__(**args)
        self.forward_layer = layer.__class__.from_config(layer.get_config())
        self.backward_layer = layer.__class__.from_config(layer.get_config())
        self.forward_layer.name = 'forward_' + self.forward_layer.name
        self.backward_layer.name = 'backward_' + self.backward_layer.name

    def reverse_sequence(self, x, mask):
        """Here mask.shape is [batch_size, seq_len, 1]."""
        seq_len = K.round(K.sum(mask, 1)[:, 0])
        seq_len = K.cast(seq_len, 'int32')
        return tf.reverse_sequence(x, seq_len, seq_dim=1)

    def call(self, inputs):
        x, mask = inputs
        x_forward = self.reuse(self.forward_layer, x)
        x_backward = self.reverse_sequence(x, mask)
        x_backward = self.reuse(self.backward_layer, x_backward)
        x_backward = self.reverse_sequence(x_backward, mask)
        x = K.concatenate([x_forward, x_backward], -1)
        if K.ndim(x) == 3:
            return x * mask
        else:
            return x

    def compute_output_shape(self, input_shape):
        return input_shape[0][:-1] + (self.forward_layer.units * 2,)
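
# Illustrative sketch of how getModel2/getModel3 combine the pieces above (my own example): a
# timestep whose embedding row sums to zero (padding) gets mask 0.0, the mask is fed to
# OurBidirectional so the backward GRU reverses only the real tokens, and Attention02 ignores the
# padded positions when pooling.
def masked_branch_example():
    x = layers.Input(shape=input_shape2[1:], dtype='float32')  # (batch, 40, 128)
    mask = Lambda(lambda t: K.cast(K.not_equal(K.sum(t, axis=-1, keepdims=True), 0), 'float32'))(x)
    rnn = OurBidirectional(GRU(64, return_sequences=True))([x, mask])  # (batch, 40, 128)
    att = Attention02()(rnn, mask=K.squeeze(mask, axis=-1))            # (batch, 128)
    return models.Model(inputs=x, outputs=att)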
def training():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    data_load = data_load.reset_index(drop=True)
    test_data = data_load.sample(frac=0.2, random_state=8)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan':
            left = ''
        if right == 'nan':
            right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan':
            left = ''
        if right == 'nan':
            right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)

    train_y, test_y = (np.array(train_y), np.array(test_y))
    train_x, test_x = (np.array(train_x), np.array(test_x))
    train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))

    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
    # cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1]],
        y=train_y,
        validation_data=([test_x[0], test_x[1]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
    # plot_loss(history=history)
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # y_pre = load_model.predict(test_x[0])
    # per-class evaluation of the predictions
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # y_pre2 = load_model.predict(train_x[0])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)
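
# Illustrative helper (restates two lines inside training()/train2(), added for clarity): the
# per-sample [left, right] embeddings are stacked into (N, 2, seq_len, emb_dim) and transposed to
# (2, N, seq_len, emb_dim) so that x[0] and x[1] can be passed to the two model inputs.
def split_branches_example(samples):
    x = np.array(samples)              # (N, 2, seq_len, emb_dim)
    x = np.transpose(x, (1, 0, 2, 3))  # (2, N, seq_len, emb_dim)
    return [x[0], x[1]]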
def train2():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
    data_load = data_load.reset_index(drop=True)
    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
    test_data = data_load.sample(frac=0.2, random_state=8)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        test_x.append(x)
        test_y.append(y)

    train_y, test_y = (np.array(train_y), np.array(test_y))
    train_x, test_x = (np.array(train_x), np.array(test_x))
    train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))

    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
    # cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1]],
        y=train_y,
        validation_data=([test_x[0], test_x[1]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
    # plot_loss(history=history)
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # y_pre = load_model.predict(test_x[0])
    # per-class evaluation of the predictions
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # y_pre2 = load_model.predict(train_x[0])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)
def train3():
    # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
    data_load = pd.read_excel("tokens_tolabel_data1_res12.xlsx", index_col=0)
    # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
    # data_load = data_load[data_load['pre_label_prob']>0.97]
    # data_load = data_load[data_load['is_same']==1]
    data_zero = pd.read_excel("tokens_label0_data1.xlsx")
    # data_old = pd.read_excel("tokens_data_02.xlsx")
    data_old = pd.read_excel("tokens_data_02_res6.xlsx")
    data_zero = data_zero[(data_zero['label'] != 0) | (data_zero['is_same'] == 2)]
    # data_zero = pd.concat([data_zero,data_zero])
    # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
    # data_zero = data_zero.sample(n=80000)
    print("Input shape:", input_shape2)
    data_x = []
    data_y = []
    for left, right, label, _label in zip(data_load['context_left'], data_load['context_right'],
                                          data_load['re_label'], data_load['label']):
        if label == _label:
            y = np.zeros(output_shape)
            y[label] = 1
            left = eval(left)
            left = left[-40:]
            right = eval(right)
            right = right[:40]
            context = [left, right]
            # x = embedding(context, shape=input_shape2)
            data_x.append(context)
            data_y.append(y)
    data_load2 = data_load[data_load['re_label'] == 0]
    for left, right, label, _label in zip(data_load2['context_left'], data_load2['context_right'],
                                          data_load2['re_label'], data_load2['label']):
        if label == _label:
            y = np.zeros(output_shape)
            y[label] = 1
            left = eval(left)
            left = left[-40:]
            if len(left) > 30:
                left = left[2:]
            elif len(left) > 15:
                left = left[1:]
            right = eval(right)
            right = right[:40]
            if len(right) > 15:
                right = right[:-1]
            context = [left, right]
            # x = embedding(context, shape=input_shape2)
            data_x.append(context)
            data_y.append(y)
    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        if len(left) > 30:
            left = left[2:]
        elif len(left) > 15:
            left = left[1:]
        right = eval(right)
        right = right[:40]
        if len(right) > 15:
            right = right[:-1]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
    #     y = np.zeros(output_shape)
    #     y[label] = 1
    #     left = eval(left)
    #     left = left[-40:]
    #     right = eval(right)
    #     right = right[:40]
    #     context = [left, right]
    #     # x = embedding(context, shape=input_shape2)
    #     data_x.append(context)
    #     data_y.append(y)
    _data = [d for d in zip(data_x, data_y)]
    import random
    random.shuffle(_data)
    data_x = [i[0] for i in _data]
    data_y = [i[1] for i in _data]
    test_len = int(len(data_x) * 0.13)
    test_x = data_x[:test_len]
    test_y = data_y[:test_len]
    print("Test set size:", len(test_x))
    train_x = data_x[test_len:]
    train_y = data_y[test_len:]
    for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        train_x.append(context)
        train_y.append(y)
    print("Training set size:", len(train_x))

    # train_y, test_y = np.array(train_y), np.array(test_y)
    # train_x = np.array(train_x)
    # test_x = np.array(test_x)
    # test_x = np.transpose(test_x, (1, 0, 2, 3))
    # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    training_generator = DataGenerator(train_x, train_y)
    # training_generator = DataGenerator(data_x, data_y)
    validation_generator = DataGenerator(test_x, test_y)
    # model = getModel3()
    model = getModel2()
    epochs = 100
    # batch_size = 256
    checkpoint = ModelCheckpoint("model_time_classify.weights", save_weights_only=True, monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
    #                              save_best_only=True, mode='min')
    history = model.fit_generator(
        generator=training_generator,
        validation_data=validation_generator,
        use_multiprocessing=True,
        workers=2,
        epochs=epochs,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
    # plot_loss(history=history)
    # load_model = models.load_model("model_label_time_classify.model.hdf5",
    #                                custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    # y_pre = load_model.predict([test_x[0], test_x[1]])
    # # y_pre = load_model.predict(test_x[0])
    # # per-class evaluation of the predictions
    # res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    # print(res1)
    # y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # # y_pre2 = load_model.predict(train_x[0])
    # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    # print(res2)
def train4():
    # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
    data_load = pd.read_excel("tokens_tolabel_data1_res13New.xlsx", index_col=0)
    # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
    # data_load = data_load[data_load['pre_label_prob']>0.97]
    # data_load = data_load[data_load['is_same']==1]
    data_zero = pd.read_excel("time_entity5.xlsx")
    data_zero = data_zero[(data_zero['viewed'] == 1) | (data_zero['is_same'] == 2)]
    # data_old = pd.read_excel("tokens_data_02.xlsx")
    data_old = pd.read_excel("tokens_data_02_res7New.xlsx")
    data_delay1 = pd.read_excel("delayTime_entity1.xlsx")
    data_delay1 = data_delay1[data_delay1['label'] != 0]
    data_delay2 = pd.read_excel("delayTime_entity2.xlsx")
    # data_zero = pd.concat([data_zero,data_zero])
    # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
    # data_zero = data_zero.sample(n=80000)
    print("Input shape:", input_shape2)
    data_x = []
    data_y = []
    import random
    for left, right, label, _label in zip(data_load['context_left'], data_load['context_right'],
                                          data_load['re_label'], data_load['label']):
        # if label==_label:
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    # data_load2 = data_load[data_load['re_label']==0]
    # for left, right, label,_label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
    #     if label==_label:
    #         y = np.zeros(output_shape)
    #         y[label] = 1
    #         left = eval(left)
    #         left = left[-40:]
    #         if len(left)>30:
    #             left = left[2:]
    #         elif len(left)>15:
    #             left = left[1:]
    #         right = eval(right)
    #         right = right[:40]
    #         if len(right)>15:
    #             right = right[:-1]
    #         context = [left, right]
    #         # x = embedding(context, shape=input_shape2)
    #         data_x.append(context)
    #         data_y.append(y)
    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    for left, right, label in zip(data_delay1['context_left'], data_delay1['context_right'], data_delay1['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    for left, right, label in zip(data_delay2['context_left'], data_delay2['context_right'], data_delay2['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    # for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
    #     y = np.zeros(output_shape)
    #     y[label] = 1
    #     left = eval(left)
    #     left = left[-40:]
    #     if len(left) > 30:
    #         left = left[2:]
    #     elif len(left) > 15:
    #         left = left[1:]
    #     right = eval(right)
    #     right = right[:40]
    #     if len(right) > 15:
    #         right = right[:-1]
    #     context = [left, right]
    #     # x = embedding(context, shape=input_shape2)
    #     data_x.append(context)
    #     data_y.append(y)
    # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
    #     y = np.zeros(output_shape)
    #     y[label] = 1
    #     left = eval(left)
    #     left = left[-40:]
    #     right = eval(right)
    #     right = right[:40]
    #     context = [left, right]
    #     # x = embedding(context, shape=input_shape2)
    #     data_x.append(context)
    #     data_y.append(y)
    for left, right, label, pre_label, is_same in zip(data_old['context_left'], data_old['context_right'],
                                                      data_old['label'], data_old['pre_label'], data_old['is_same']):
        if label == 0:
            if is_same == 1:
                pass
            else:
                if pre_label > 3:
                    label = pre_label
                else:
                    continue
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    _data = [d for d in zip(data_x, data_y)]
    random.shuffle(_data)
    data_x = [i[0] for i in _data]
    data_y = [i[1] for i in _data]
    test_len = int(len(data_x) * 0.11)
    test_x = data_x[:test_len]
    test_y = data_y[:test_len]
    print("Test set size:", len(test_x))
    train_x = data_x[test_len:]
    train_y = data_y[test_len:]
    # for left, right, label,pre_label,is_same in zip(data_old['context_left'], data_old['context_right'], data_old['label'],
    #                                                 data_old['pre_label'],data_old['is_same']):
    #     # if label==0:
    #     #     if random.random()>0.25:
    #     #         continue
    #     if label==0:
    #         if is_same==1:
    #             pass
    #         else:
    #             if pre_label>3:
    #                 label = pre_label
    #             else:
    #                 continue
    #     y = np.zeros(output_shape)
    #     y[label] = 1
    #     left = eval(left)
    #     left = left[-40:]
    #     right = eval(right)
    #     right = right[:40]
    #     context = [left, right]
    #     # x = embedding(context, shape=input_shape2)
    #     train_x.append(context)
    #     train_y.append(y)
    print("Training set size:", len(train_x))

    # train_y, test_y = np.array(train_y), np.array(test_y)
    # train_x = np.array(train_x)
    # test_x = np.array(test_x)
    # test_x = np.transpose(test_x, (1, 0, 2, 3))
    # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    training_generator = DataGenerator(train_x, train_y, is_train=True)
    # training_generator = DataGenerator(data_x, data_y)
    validation_generator = DataGenerator(test_x, test_y, is_train=False, shuffle=False)
    # model = getModel3()
    model = getModel2()
    epochs = 100
    # batch_size = 256
    checkpoint = ModelCheckpoint("model_time_classify.weights", save_weights_only=True, monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
    #                              save_best_only=True, mode='min')
    history = model.fit_generator(
        generator=training_generator,
        validation_data=validation_generator,
        use_multiprocessing=True,
        workers=2,
        epochs=epochs,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
from keras.utils import Sequence, to_categorical


class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, texts, labels, is_train=True, batch_size=256,
                 n_classes=len(time_label_dict), shuffle=True):
        'Initialization'
        # self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.texts = texts
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.is_train = is_train
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        _len = len(self.texts) // self.batch_size
        if len(self.texts) % self.batch_size != 0:
            _len += 1
        return _len

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        # Find list of IDs
        list_texts = [self.texts[k] for k in indexes]
        _label = [self.labels[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(list_texts, _label)
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.texts))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_texts, _label):
        'Generates data containing batch_size samples'
        # Initialization
        # X = np.empty((self.batch_size, *self.dim))
        # y = np.empty((self.batch_size), dtype=int)
        # batch_len = len(list_texts)
        # x = np.empty((batch_len, *self.dim))
        x = []
        # y = np.empty((batch_len), dtype=int)
        # Generate data
        for i, context in enumerate(list_texts):
            # Store sample
            if self.is_train:
                left = context[0]
                if len(left) > 30:
                    if random.random() > 0.5:
                        left = left[2:]
                elif len(left) > 15:
                    if random.random() > 0.5:
                        left = left[1:]
                right = context[1]
                if len(right) > 30:
                    if random.random() > 0.5:
                        right = right[:-2]
                elif len(right) > 15:
                    if random.random() > 0.5:
                        right = right[:-1]
                context = [left, right]
            words_matrix = embedding_mywords(context, shape=input_shape2)
            # Store class
            # y[i] = _label[i]
            x.append(words_matrix)
        x = np.array(x)
        x = np.transpose(x, (1, 0, 2, 3))
        return [x[0], x[1]], np.array(_label)
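
# Illustrative sketch (my own helper): a batch produced by DataGenerator is ([left, right], y),
# where each branch array has shape (batch, 40, 128) after embedding_mywords(), matching what
# getModel2() expects in train3()/train4().
def peek_generator_batch_example(texts, labels):
    gen = DataGenerator(texts, labels, is_train=False, shuffle=False, batch_size=4)
    (left_batch, right_batch), y = gen[0]
    return left_batch.shape, right_batch.shape, np.array(y).shape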
def predict2():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    error_data = data_load[data_load['label'] != data_load['pre']]
    # print(error_data.info())
    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")


def predict3():
    data = pd.read_csv("new_tokens_data1.csv", chunksize=5000)
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    new_data = pd.DataFrame()
    idx = 0
    for _data in data:
        test_x = []
        test_y = []
        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['label']):
            left = eval(left)
            left = left[-10:]
            right = eval(right)
            right = right[:10]
            label = int(label)
            y = np.zeros(output_shape)
            y[label] = 1
            context = [left, right]
            x = embedding(context, shape=input_shape2)
            test_x.append(x)
            test_y.append(y)
        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
        pre_y = model1.predict([test_x[0], test_x[1]])
        _data['pre'] = [np.argmax(item) for item in pre_y]
        _data['is_same'] = [1 if int(_label) == _pre else 0 for _label, _pre in zip(_data['label'], _data['pre'])]
        # data['label'] = label
        new_data = pd.concat([new_data, _data])
        idx += 5000
        print(idx)
    # data.to_csv("new_tokens_data1.csv")
    new_data.to_excel("new_tokens_data1_res.xlsx")


def predict4():
    data = pd.read_csv("tokens_data_02_res6New.csv", chunksize=3000)
    # data = pd.read_excel("C:\\Users\\Administrator\\Desktop\\time_entity4.xlsx")
    # data.to_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv")
    # data = pd.read_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv", chunksize=3000)
    model1 = getModel2()
    model1.load_weights("model_time_classify.weights")
    new_data = pd.DataFrame()
    idx = 0
    for _data in data:
        test_x = []
        test_y = []
        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['re_label']):
            left = eval(left)
            left = left[-40:]
            right = eval(right)
            right = right[:40]
            label = int(label)
            y = np.zeros(output_shape)
            y[label] = 1
            context = [left, right]
            x = embedding_mywords(context, shape=input_shape2)
            test_x.append(x)
            test_y.append(y)
        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
        pre_y = model1.predict([test_x[0], test_x[1]])
        _data['pre_label'] = [np.argmax(item) for item in pre_y]
        _data['pre_label_prob'] = [max(item) for item in pre_y]
        _data['is_same'] = [1 if int(_label) == _pre else 0 for _label, _pre in zip(_data['label'], _data['pre_label'])]
        # _data['is_same'] = [1 if int(_re)==int(_pre) and int(_re)==int(_label) else 0 for _label,_re,_pre in zip(_data['label'],_data['re_label'],_data['pre_label'])]
        # data['label'] = label
        new_data = pd.concat([new_data, _data])
        idx += 3000
        print(idx)
    # new_data.to_csv("tokens_data_02_res7New.csv")
    new_data.to_excel("tokens_data_02_res7New.xlsx")
    # new_data.to_excel("C:\\Users\\Administrator\\Desktop\\tokens_data_02_res7New.xlsx")
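
# Illustrative helper (my own addition): the predict* functions store np.argmax(pre_y) as an
# integer label; this reverse lookup of time_label_dict maps that index back to its label name,
# e.g. 2 -> 'time_bidopen'.
index_to_label = {v: k for k, v in time_label_dict.items()}

def pre_index_to_name_example(pre_y_row):
    return index_to_label[int(np.argmax(pre_y_row))]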
def predict():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan':
            left = ''
        if right == 'nan':
            right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    # print(error_data.info())
    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")


def data_process():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
    re_left = re.compile("。[^。]*?$")
    re_right = re.compile("^[^。]*?。")
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        if right == 'nan':
            right = ''
            # print(1)
        if re.search("。", left):
            left = re_left.search(left)
            left = left.group()[1:]
        if re.search("。", right):
            right = re_right.search(right)
            right = right.group()
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")


def data_process2():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        if right == 'nan':
            right = ''
        if left == 'nan':
            left = ''
        left = left[max(len(left) - 20, 0):]
        right = right[:20]
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_20_prc.csv")


def data_process3():
    data = load('db_time_data.pk')
    data = data.drop('value', axis=1)
    token_begin = []
    token_end = []
    context_left = []
    context_right = []
    data2 = pd.read_csv("newdata_30_prc2.csv")
    label = []
    # data=data[:20]
    for id, sentences, tokens, offset, begin, end, entity_text in zip(data['document_id'], data['sentences'],
                                                                      data['tokens'], data['offsets_to_text'],
                                                                      data['begin_index'], data['end_index'],
                                                                      data['entity_text']):
        _label = data2[(data2['document_id'] == int(id)) & (data2['begin_index'] == int(begin))][:1]
        if not _label.empty:
            _label = int(_label['re_label'])
        else:
            _label = 0
        label.append(_label)
        begin = int(begin)
        end = int(end)
        entity_tbegin = 0
        entity_tend = 0
        find_begin = False
        for t in range(len(offset)):
            if not find_begin:
                if offset[t] == begin:
                    entity_tbegin = t
                    find_begin = True
                if offset[t] > begin:
                    entity_tbegin = t - 1
                    find_begin = True
            if offset[t] >= end:
                entity_tend = t
                break
        token_begin.append(entity_tbegin)
        token_end.append(entity_tend)
        s = spanWindow(tokens=tokens, begin_index=entity_tbegin, end_index=entity_tend - 1, size=40)
        s1 = s[0]
        _temp1 = []
        for i in range(len(s1)):
            if s1[i] == "。":
                _temp1.append(i)
        if _temp1:
            s1 = s1[_temp1[-1] + 1:]
        s2 = s[1]
        _temp2 = []
        for i in range(len(s2)):
            if s2[i] == "。":
                _temp2.append(i)
                break
        if _temp2:
            s2 = s2[:_temp2[0] + 1]
        # print(s2)
        context_left.append(s1)
        context_right.append(s2)
        print(id)
        # print(_label)
        # print(entity_text)
        # print(tokens[entity_tbegin:entity_tend])
    data['token_begin'] = token_begin
    data['token_end'] = token_end
    data['context_left'] = context_left
    data['context_right'] = context_right
    data['label'] = label
    data = data.drop(['tokens', 'offsets_to_text', 'sentences'], axis=1)
    # data.to_csv("tokens_data_02.csv")
    data.to_excel("tokens_data_02.xlsx")
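
# Illustrative helper showing the same sentence-boundary trimming rule data_process() applies
# (my own standalone re-statement): keep only the text after the last '。' in the left context
# and up to the first '。' in the right context.
def trim_to_sentence_example(left, right):
    m_left = re.search("。[^。]*?$", left)
    if m_left:
        left = m_left.group()[1:]
    m_right = re.search("^[^。]*?。", right)
    if m_right:
        right = m_right.group()
    return left, right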
def plot_loss(history):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()


def embedding_mywords(datas, shape):
    '''
    @summary: look up the word vector for each token
    @param:
        datas: list of token lists
        shape: shape of the result
    @return: array, the word embeddings in the given shape
    '''
    model_w2v = getModel_w2v()
    embed = np.zeros(shape)
    length = shape[1]
    out_index = 0
    # print(datas)
    for data in datas:
        index = 0
        for item in data:
            item_not_space = re.sub("\s*", "", item)
            if index >= length:
                break
            if item_not_space in model_w2v.vocab:
                embed[out_index][index] = model_w2v[item_not_space]
                index += 1
            else:
                embed[out_index][index] = model_w2v['unk']
                index += 1
        out_index += 1
    return embed


def save_model():
    graph = tf.Graph()
    with graph.as_default() as graph:
        with tf.Session(graph=graph).as_default() as sess:
            test_model = getModel2()
            test_model.load_weights("model_time_classify.weights")
            tf.saved_model.simple_save(sess,
                                       "models/timesplit_model2/",
                                       inputs={"input0": test_model.input[0],
                                               "input1": test_model.input[1]},
                                       outputs={"outputs": test_model.output})


if __name__ == '__main__':
    # get_data()
    # getModel()
    # getModel2()
    # getModel3()
    # training()
    # train2()
    # train3()
    # train4()
    # data_process()
    # data_process2()
    # data_process3()
    # predict()
    # predict2()
    # predict3()
    # predict4()
    save_model()
    pass
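
# Illustrative sketch (assumption: TF1-style SavedModel loading; not part of the original file):
# load the graph exported by save_model() and run one prediction. Tensor names are read from the
# serving signature that tf.saved_model.simple_save recorded rather than hard-coded.
def load_savedmodel_example(export_dir="models/timesplit_model2/"):
    graph = tf.Graph()
    with graph.as_default():
        with tf.Session(graph=graph) as sess:
            meta_graph = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], export_dir)
            sig = meta_graph.signature_def[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
            input0 = graph.get_tensor_by_name(sig.inputs["input0"].name)
            input1 = graph.get_tensor_by_name(sig.inputs["input1"].name)
            outputs = graph.get_tensor_by_name(sig.outputs["outputs"].name)
            left = np.zeros((1,) + input_shape2[1:], dtype='float32')
            right = np.zeros((1,) + input_shape2[1:], dtype='float32')
            return sess.run(outputs, feed_dict={input0: left, input1: right})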