@@ -1,11 +1,13 @@
 import sys
 import os
 sys.path.append(os.path.abspath("../.."))
+# sys.path.append('/data/python_znj/znj/BIDI_ML_INFO_EXTRACTION/')
 import pandas as pd
 import re
 import psycopg2
 from keras.callbacks import ModelCheckpoint
 from keras import layers,models,optimizers,losses
+from keras.layers import *
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.common.models import *
 from sklearn.metrics import classification_report
@@ -13,15 +15,15 @@ from sklearn.utils import shuffle,class_weight
 import matplotlib.pyplot as plt

 input_shape = (2,30,60)
-input_shape2 = (2,10,128)
+input_shape2 = (2,40,128)
 output_shape = [4]

 def get_data():
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
+    data_load = pd.read_csv("newdata_30_prc.csv", index_col=0)
     id_set = set()
     for id in data_load['document_id']:
         id_set.add(id)
-    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
+    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.103")
     sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
           "FROM corpus_iedocument A,brat_bratannotation B " \
           "WHERE A.human_identifier = '%s' " \
@@ -47,10 +49,12 @@ def get_data():
     df = pd.concat([df, time_label], axis=1)
     print(df.info())
     df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
-    df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
-    df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
-    df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
-    df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
+    df['sentences'] = [eval(sentence) for sentence in df['sentences']]
+    # df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
+    # df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
+    df['offsets_to_text'] = [eval(offset) for offset in df['offsets_to_text']]
+    # df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
+    # df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
     save(df,'db_time_data.pk')

 def getModel():
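# Note on the eval()-based parsing introduced above: 'sentences' and 'offsets_to_text' come
# back from the database as stringified Python lists such as "[0, 45, 120]", so eval() rebuilds
# the list of ints in one step. An equivalent sketch on the same df with the stricter
# ast.literal_eval (illustrative only, not part of this patch):
import ast
df['sentences'] = [ast.literal_eval(s) for s in df['sentences']]
df['offsets_to_text'] = [ast.literal_eval(o) for o in df['offsets_to_text']]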
@@ -78,6 +82,163 @@ def getModel():
     model.summary()
     return model

+def getModel2():
+    '''
+    @summary: time-category classification model
+    '''
+    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
+    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
+
+    L_input_drop = Dropout(0.2)(L_input)
+    R_input_drop = Dropout(0.2)(R_input)
+    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
+    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
+    L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
+    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
+    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
+    R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+    concat = layers.merge([L_att, R_att], mode='concat')
+    concat = Dropout(0.3)(concat)
+    output = layers.Dense(output_shape[0],activation="softmax")(concat)
+
+    model = models.Model(inputs=[L_input,R_input], outputs=output)
+
+    learn_rate = 0.00005
+    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
+                  loss=losses.binary_crossentropy,
+                  metrics=[precision,recall,f1_score])
+    model.summary()
+    return model
+
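# A minimal usage sketch for getModel2() (illustrative, not part of the patch): each sample is
# a pair of 40-token context windows embedded as 128-dim word vectors, i.e. input_shape2[1:].
import numpy as np
m = getModel2()
left = np.zeros((8, 40, 128), dtype='float32')    # left-context embeddings for a batch of 8
right = np.zeros((8, 40, 128), dtype='float32')   # right-context embeddings
probs = m.predict([left, right])                  # -> (8, 4) softmax over the four time classes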
+def getModel3():
+    '''
+    @summary: time-category classification model
+    '''
+    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
+    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
+
+    L_input_drop = Dropout(0.2)(L_input)
+    R_input_drop = Dropout(0.2)(R_input)
+    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
+    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
+    # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
+    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
+    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
+    concat = layers.merge([L_lstm,R_lstm], mode='concat',concat_axis=1)
+    concat_mask = layers.merge([L_mask,R_mask], mode='concat',concat_axis=1)
+    att = Attention02()(concat,mask=K.squeeze(concat_mask,axis=-1))
+    # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+    # concat = layers.merge([L_att, R_att], mode='concat')
+    att = Dropout(0.3)(att)
+    output = layers.Dense(output_shape[0],activation="softmax")(att)
+
+    model = models.Model(inputs=[L_input,R_input], outputs=output)
+
+    learn_rate = 0.0001
+    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
+                  loss=losses.binary_crossentropy,
+                  metrics=[precision,recall,f1_score])
+    model.summary()
+    return model
+
+class Attention02(Layer):
+    def __init__(self, **kwargs):
+        self.init = initializers.get('normal')
+        self.supports_masking = True
+        self.attention_dim = 50
+        super(Attention02, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        assert len(input_shape) == 3
+        self.W = K.variable(self.init((input_shape[-1], 1)))
+        self.b = K.variable(self.init((self.attention_dim,)))
+        self.u = K.variable(self.init((self.attention_dim, 1)))
+        self.trainable_weights = [self.W, self.b, self.u]
+        super(Attention02, self).build(input_shape)
+
+    def compute_mask(self, inputs, mask=None):
+        return mask
+
+    def call(self, x, mask=None):
+        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
+        ait = K.dot(uit, self.u)
+        ait = K.squeeze(ait, -1)
+        ait = K.exp(ait)
+
+        if mask is not None:
+            ait = ait * K.cast(mask, K.floatx())
+            # ait = ait * mask
+
+        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+        ait = K.expand_dims(ait)
+        weighted_input = x * ait
+        output = K.sum(weighted_input, axis=1)
+        return output
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1])
+
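# A minimal sketch of what Attention02 does (illustrative, not part of the patch): score every
# timestep, normalise the scores over the sequence axis while ignoring masked/padded steps, and
# return the weighted sum, collapsing (batch, timesteps, features) to (batch, features). This
# mirrors how the layer is driven inside getModel2()/getModel3().
seq = layers.Input(shape=(40, 128), dtype='float32')
seq_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1), 0), 'float32'))(seq)
pooled = Attention02()(seq, mask=seq_mask)        # -> (batch, 128)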
+class OurLayer(Layer):
+    """A Layer subclass that adds a reuse() method, so existing layers can be called while defining a new Layer.
+    """
+    def reuse(self, layer, *args, **kwargs):
+        if not layer.built:
+            if len(args) > 0:
+                inputs = args[0]
+            else:
+                inputs = kwargs['inputs']
+            if isinstance(inputs, list):
+                input_shape = [K.int_shape(x) for x in inputs]
+            else:
+                input_shape = K.int_shape(inputs)
+            layer.build(input_shape)
+        outputs = layer.call(*args, **kwargs)
+        for w in layer.trainable_weights:
+            if w not in self._trainable_weights:
+                self._trainable_weights.append(w)
+        for w in layer.non_trainable_weights:
+            if w not in self._non_trainable_weights:
+                self._non_trainable_weights.append(w)
+        for u in layer.updates:
+            if not hasattr(self, '_updates'):
+                self._updates = []
+            if u not in self._updates:
+                self._updates.append(u)
+        return outputs
+class OurBidirectional(OurLayer):
+    """A hand-rolled bidirectional RNN wrapper that accepts an explicit mask, so the reversed sequence stays aligned with the forward one.
+    """
+    def __init__(self, layer, **args):
+        super(OurBidirectional, self).__init__(**args)
+        self.forward_layer = layer.__class__.from_config(layer.get_config())
+        self.backward_layer = layer.__class__.from_config(layer.get_config())
+        self.forward_layer.name = 'forward_' + self.forward_layer.name
+        self.backward_layer.name = 'backward_' + self.backward_layer.name
+    def reverse_sequence(self, x, mask):
+        """Here mask.shape is [batch_size, seq_len, 1]
+        """
+        seq_len = K.round(K.sum(mask, 1)[:, 0])
+        seq_len = K.cast(seq_len, 'int32')
+        return tf.reverse_sequence(x, seq_len, seq_dim=1)
+    def call(self, inputs):
+        x, mask = inputs
+        x_forward = self.reuse(self.forward_layer, x)
+        x_backward = self.reverse_sequence(x, mask)
+        x_backward = self.reuse(self.backward_layer, x_backward)
+        x_backward = self.reverse_sequence(x_backward, mask)
+        x = K.concatenate([x_forward, x_backward], -1)
+        if K.ndim(x) == 3:
+            return x * mask
+        else:
+            return x
+    def compute_output_shape(self, input_shape):
+        return input_shape[0][:-1] + (self.forward_layer.units * 2,)
+
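# A minimal usage sketch for OurBidirectional (illustrative), mirroring getModel2()/getModel3():
# pass the sequence together with a (batch, timesteps, 1) mask; padded steps are zeroed out and
# the backward GRU is reversed only over the real sequence length, keeping both directions aligned.
seq = layers.Input(shape=(40, 128), dtype='float32')
seq_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(seq)
bi = OurBidirectional(GRU(64, return_sequences=True))([seq, seq_mask])   # -> (batch, 40, 128)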
+

 def training():
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
@@ -215,6 +376,222 @@ def train2():
     res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     print(res2)

+def train3():
+    # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
+    data_load = pd.read_excel("tokens_tolabel_data1_res12.xlsx", index_col=0)
+    # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
+    # data_load = data_load[data_load['pre_label_prob']>0.97]
+    # data_load = data_load[data_load['is_same']==1]
+    data_zero = pd.read_excel("tokens_label0_data1.xlsx")
+    # data_old = pd.read_excel("tokens_data_02.xlsx")
+    data_old = pd.read_excel("tokens_data_02_res6.xlsx")
+    data_zero = data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)]
+    # data_zero = pd.concat([data_zero,data_zero])
+    # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
+    # data_zero = data_zero.sample(n=80000)
+    print("Input shape:", input_shape2)
+    data_x = []
+    data_y = []
+    for left, right, label,_label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label'], data_load['label']):
+        if label==_label:
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            data_x.append(context)
+            data_y.append(y)
+    data_load2 = data_load[data_load['re_label']==0]
+    for left, right, label,_label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
+        if label==_label:
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            if len(left)>30:
+                left = left[2:]
+            elif len(left)>15:
+                left = left[1:]
+            right = eval(right)
+            right = right[:40]
+            if len(right)>15:
+                right = right[:-1]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            data_x.append(context)
+            data_y.append(y)
+
+    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+
+    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        if len(left) > 30:
+            left = left[2:]
+        elif len(left) > 15:
+            left = left[1:]
+        right = eval(right)
+        right = right[:40]
+        if len(right) > 15:
+            right = right[:-1]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+
+    # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
+    #     y = np.zeros(output_shape)
+    #     y[label] = 1
+    #     left = eval(left)
+    #     left = left[-40:]
+    #     right = eval(right)
+    #     right = right[:40]
+    #     context = [left, right]
+    #     # x = embedding(context, shape=input_shape2)
+    #     data_x.append(context)
+    #     data_y.append(y)
+
+    _data = [d for d in zip(data_x,data_y)]
+    import random
+    random.shuffle(_data)
+    data_x = [i[0] for i in _data]
+    data_y = [i[1] for i in _data]
+    test_len = int(len(data_x) * 0.13)
+    test_x = data_x[:test_len]
+    test_y = data_y[:test_len]
+    print("Test set size:", len(test_x))
+    train_x = data_x[test_len:]
+    train_y = data_y[test_len:]
+
+    for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        train_x.append(context)
+        train_y.append(y)
+    print("Training set size:", len(train_x))
+
+    # train_y, test_y = np.array(train_y), np.array(test_y)
+    # train_x = np.array(train_x)
+    # test_x = np.array(test_x)
+    # test_x = np.transpose(test_x, (1, 0, 2, 3))
+    # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
+    training_generator = DataGenerator(train_x, train_y)
+    # training_generator = DataGenerator(data_x, data_y)
+    validation_generator = DataGenerator(test_x, test_y)
+
+    # model = getModel3()
+    model = getModel2()
+    epochs = 100
+    # batch_size = 256
+    checkpoint = ModelCheckpoint("model_time_classify.weights",save_weights_only=True, monitor="val_loss", verbose=1,
+                                 save_best_only=True, mode='min')
+    # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
+    #                              save_best_only=True, mode='min')
+
+    history = model.fit_generator(
+        generator=training_generator,
+        validation_data=validation_generator,
+        use_multiprocessing=True, workers=2,
+        epochs=epochs,
+        shuffle=True,
+        callbacks=[checkpoint],
+        class_weight='auto'
+    )
+    # plot_loss(history=history)
+    # load_model = models.load_model("model_label_time_classify.model.hdf5",
+    #                                custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
+    # y_pre = load_model.predict([test_x[0], test_x[1]])
+    # # y_pre = load_model.predict(test_x[0])
+    # # per-class prediction evaluation
+    # res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
+    # print(res1)
+    # y_pre2 = load_model.predict([train_x[0], train_x[1]])
+    # # y_pre2 = load_model.predict(train_x[0])
+    # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
+    # print(res2)
+from keras.utils import Sequence,to_categorical
+class DataGenerator(Sequence):
+    'Generates data for Keras'
+    def __init__(self, texts, labels, batch_size=256,
+                 n_classes=4, shuffle=True):
+        'Initialization'
+        # self.dim = dim
+        self.batch_size = batch_size
+        self.labels = labels
+        self.texts = texts
+        self.n_classes = n_classes
+        self.shuffle = shuffle
+        self.on_epoch_end()
+
+    def __len__(self):
+        'Denotes the number of batches per epoch'
+        _len = len(self.texts) // self.batch_size
+        if len(self.texts) % self.batch_size != 0:
+            _len += 1
+        return _len
+
+    def __getitem__(self, index):
+        'Generate one batch of data'
+        # Generate indexes of the batch
+        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
+
+        # Find list of IDs
+        list_texts = [self.texts[k] for k in indexes]
+        _label = [self.labels[k] for k in indexes]
+        # Generate data
+        X, y = self.__data_generation(list_texts,_label)
+
+        return X, y
+
+    def on_epoch_end(self):
+        'Updates indexes after each epoch'
+        self.indexes = np.arange(len(self.texts))
+        if self.shuffle == True:
+            np.random.shuffle(self.indexes)
+
+    def __data_generation(self, list_texts,_label):
+        'Generates data containing batch_size samples'
+        # Initialization
+        # X = np.empty((self.batch_size, *self.dim))
+        # y = np.empty((self.batch_size), dtype=int)
+        # batch_len = len(list_texts)
+        # x = np.empty((batch_len, *self.dim))
+        x = []
+        # y = np.empty((batch_len), dtype=int)
+
+        # Generate data
+        for i, context in enumerate(list_texts):
+            # Store sample
+            # tokens = preprocess2(text)
+            # tokens = tokens[:maxlen]
+            words_matrix = embedding_mywords(context, shape=input_shape2)
+            # Store class
+            # y[i] = _label[i]
+            x.append(words_matrix)
+        x = np.array(x)
+        x = np.transpose(x, (1, 0, 2, 3))
+        return [x[0],x[1]], np.array(_label)
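# A minimal usage sketch for DataGenerator (illustrative, not part of the patch): `texts` holds
# [left_tokens, right_tokens] pairs and `labels` holds length-4 one-hot vectors, as built in
# train3() above; each batch is embedded on the fly with embedding_mywords() and returned as the
# two model inputs plus the label array.
gen = DataGenerator(texts=data_x, labels=data_y, batch_size=256)
[left_batch, right_batch], label_batch = gen[0]   # left/right: (256, 40, 128), labels: (256, 4)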

 def predict2():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
@@ -237,6 +614,73 @@ def predict2():
     # print(error_data.info())
     error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")

+def predict3():
+    data = pd.read_csv("new_tokens_data1.csv", chunksize=5000)
+    model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
+    new_data = pd.DataFrame()
+    idx = 0
+    for _data in data:
+
+        test_x = []
+        test_y = []
+        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['label']):
+            left = eval(left)
+            left = left[-10:]
+            right = eval(right)
+            right = right[:10]
+            label = int(label)
+            y = np.zeros(output_shape)
+            y[label] = 1
+            context = [left, right]
+            x = embedding(context, shape=input_shape2)
+            test_x.append(x)
+            test_y.append(y)
+        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
+        pre_y = model1.predict([test_x[0], test_x[1]])
+        _data['pre'] = [np.argmax(item) for item in pre_y]
+        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['label'],_data['pre'])]
+        # data['label'] = label
+        new_data = pd.concat([new_data, _data])
+        idx += 5000
+        print(idx)
+    # data.to_csv("new_tokens_data1.csv")
+    new_data.to_excel("new_tokens_data1_res.xlsx")
+
+def predict4():
+    data = pd.read_csv("tokens_tolabel_data1_res11.csv", chunksize=3000)
+    model1 = getModel2()
+    model1.load_weights("model_time_classify.weights")
+    new_data = pd.DataFrame()
+    idx = 0
+    for _data in data:
+        test_x = []
+        test_y = []
+        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['re_label']):
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            label = int(label)
+            y = np.zeros(output_shape)
+            y[label] = 1
+            context = [left, right]
+            x = embedding_mywords(context, shape=input_shape2)
+            test_x.append(x)
+            test_y.append(y)
+        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
+        pre_y = model1.predict([test_x[0], test_x[1]])
+        _data['pre_label'] = [np.argmax(item) for item in pre_y]
+        _data['pre_label_prob'] = [max(item) for item in pre_y]
+        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['re_label'],_data['pre_label'])]
+        # _data['is_same'] = [1 if int(_re)==int(_pre) and int(_re)==int(_label) else 0 for _label,_re,_pre in zip(_data['label'],_data['re_label'],_data['pre_label'])]
+        # data['label'] = label
+        new_data = pd.concat([new_data, _data])
+        idx += 3000
+        print(idx)
+    # data.to_csv("new_tokens_data1.csv")
+    new_data.to_excel("tokens_tolabel_data1_res12.xlsx")
+
+
 def predict():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
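# The transpose used in predict3()/predict4() and in DataGenerator above turns a stack of
# per-sample [left, right] embeddings of shape (N, 2, 40, 128) into (2, N, 40, 128), so that
# test_x[0] and test_x[1] become the two batched model inputs. A tiny illustration:
import numpy as np
batch = np.zeros((5, 2, 40, 128))                 # 5 samples, each a (left, right) pair
batch = np.transpose(batch, (1, 0, 2, 3))         # -> (2, 5, 40, 128)
left_in, right_in = batch[0], batch[1]            # each (5, 40, 128), ready for model.predict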
@@ -313,7 +757,7 @@ def data_process3():
     token_end = []
     context_left = []
     context_right = []
-    data2 = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc2.csv")
+    data2 = pd.read_csv("newdata_30_prc2.csv")
     label = []
     # data=data[:20]
     for id,sentences,tokens,offset,begin,end,entity_text in zip(data['document_id'],data['sentences'],data['tokens'],data['offsets_to_text'],
@@ -343,7 +787,7 @@ def data_process3():
                 break
         token_begin.append(entity_tbegin)
         token_end.append(entity_tend)
-        s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend,size=10)
+        s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend-1,size=40)
         s1 = s[0]
         _temp1 = []
         for i in range(len(s1)):
@@ -372,7 +816,8 @@ def data_process3():
     data['context_right'] = context_right
     data['label'] = label
     data = data.drop(['tokens','offsets_to_text','sentences'],axis=1)
-    data.to_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv")
+    # data.to_csv("tokens_data_02.csv")
+    data.to_excel("tokens_data_02.xlsx")

 def plot_loss(history):
     plt.plot(history.history['loss'])
@@ -383,15 +828,64 @@ def plot_loss(history):
     plt.legend(['Train', 'Test'], loc='upper left')
     plt.show()

+def embedding_mywords(datas,shape):
+    '''
+    @summary: look up the word vector for each token
+    @param:
+        datas: list of token lists
+        shape: shape of the result
+    @return: array, the word embeddings in the given shape
+    '''
+    model_w2v = getModel_w2v()
+    embed = np.zeros(shape)
+    length = shape[1]
+    out_index = 0
+    #print(datas)
+    for data in datas:
+        index = 0
+        for item in data:
+            item_not_space = re.sub("\s*","",item)
+            if index>=length:
+                break
+            if item_not_space in model_w2v.vocab:
+                embed[out_index][index] = model_w2v[item_not_space]
+                index += 1
+            else:
+                embed[out_index][index] = model_w2v['unk']
+                index += 1
+        out_index += 1
+    return embed
+
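# A minimal usage sketch for embedding_mywords (illustrative, hypothetical tokens): whitespace is
# stripped from each token, tokens found in the word2vec vocabulary get their vector, everything
# else falls back to the 'unk' vector, and each window is cut/zero-padded to shape[1] tokens.
context = [['公告', '时间', ':'], ['2020', '年', '5', '月']]   # a [left_tokens, right_tokens] pair
matrix = embedding_mywords(context, shape=input_shape2)        # -> ndarray of shape (2, 40, 128)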
+def save_model():
+    graph = tf.Graph()
+    with graph.as_default() as graph:
+        with tf.Session(graph=graph).as_default() as sess:
+            test_model = getModel2()
+            test_model.load_weights("model_time_classify.weights")
+            tf.saved_model.simple_save(sess,
+                                       "models/timesplit_model/",
+                                       inputs={"input0": test_model.input[0],
+                                               "input1":test_model.input[1]
+                                               },
+                                       outputs={"outputs": test_model.output})
+
+
+
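# A minimal sketch of loading the SavedModel exported by save_model() (illustrative; TF 1.x
# session API to match simple_save above — the signature inspection is an assumption):
import tensorflow as tf
with tf.Session(graph=tf.Graph()) as sess:
    meta = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                      "models/timesplit_model/")
    sig = meta.signature_def[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    print(list(sig.inputs.keys()), list(sig.outputs.keys()))   # expect ['input0', 'input1'] / ['outputs']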
 if __name__ == '__main__':
     # get_data()
     # getModel()
+    # getModel2()
+    # getModel3()
     # training()
     # train2()
+    # train3()
     # data_process()
     # data_process2()
     # data_process3()
     # predict()
     # predict2()
+    # predict3()
+    # predict4()
+    save_model()

     pass