@@ -13,10 +13,32 @@ from BiddingKG.dl.common.models import *
 from sklearn.metrics import classification_report
 from sklearn.utils import shuffle,class_weight
 import matplotlib.pyplot as plt
+import random
 
 input_shape = (2,30,60)
 input_shape2 = (2,40,128)
-output_shape = [4]
+# output_shape = [4]
+
+time_label_dict = {
+    'time': 0,
+    'time_release': 1,  # release time
+    'time_bidopen': 2,  # bid opening time
+    'time_bidclose': 3,  # bid closing time
+    'time_bidstart': 12,  # bid submission (start) time; response-document receipt (start) time
+
+    'time_publicityStart': 4,  # publicity start time (publicity time / publicity period)
+    'time_publicityEnd': 5,  # publicity deadline
+    'time_getFileStart': 6,  # document acquisition start time (document acquisition time)
+    'time_getFileEnd': 7,  # document acquisition deadline
+    'time_registrationStart': 8,  # registration start time (registration time)
+    'time_registrationEnd': 9,  # registration deadline
+    'time_earnestMoneyStart': 10,  # earnest money submission start time (earnest money submission time)
+    'time_earnestMoneyEnd': 11,  # earnest money submission deadline
+    'time_commencement': 13,  # commencement date
+    'time_completion': 14  # completion date
+    }
+output_shape = [len(time_label_dict)]
+
 
 def get_data():
     data_load = pd.read_csv("newdata_30_prc.csv", index_col=0)
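
(Note: each label id above indexes a one-hot target of length output_shape[0], i.e. 15 classes. A minimal NumPy sketch of the target construction that train4() below relies on; illustration only, not part of the diff.)

    import numpy as np
    label = time_label_dict['time_bidopen']   # -> 2
    y = np.zeros(output_shape)                # shape (15,)
    y[label] = 1                              # one-hot training target
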
@@ -91,16 +113,23 @@ def getModel2():
     R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
     R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
 
-    L_input_drop = Dropout(0.2)(L_input)
-    R_input_drop = Dropout(0.2)(R_input)
+    L_input_drop = Dropout(0.3)(L_input)
+    R_input_drop = Dropout(0.3)(R_input)
     # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
     L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
     L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
     # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
     R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
     R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+    L_R = layers.merge([L_lstm, R_lstm],concat_axis=1, mode='concat')
+    L_R_mask = layers.merge([L_mask, R_mask],concat_axis=1, mode='concat')
+    L_R_att = Attention02()(L_R,mask=K.squeeze(L_R_mask,axis=-1))
+
+    L_att = layers.add([L_att,L_R_att])
+    R_att = layers.add([R_att,L_R_att])
     concat = layers.merge([L_att, R_att], mode='concat')
-    concat = Dropout(0.3)(concat)
+
+    concat = Dropout(0.2)(concat)
     output = layers.Dense(output_shape[0],activation="softmax")(concat)
 
     model = models.Model(inputs=[L_input,R_input], outputs=output)
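
(Note: layers.merge(..., mode='concat') with concat_axis is the Keras 1 API this file targets; under Keras 2 the shared-context branch added above would read roughly as below. Untested equivalent sketch, illustration only.)

    L_R = layers.concatenate([L_lstm, R_lstm], axis=1)       # stack both token sequences in time
    L_R_mask = layers.concatenate([L_mask, R_mask], axis=1)  # stack the matching masks
    L_R_att = Attention02()(L_R, mask=K.squeeze(L_R_mask, axis=-1))
    concat = layers.concatenate([L_att, R_att])              # feature-axis concat (axis=-1)
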
@@ -111,6 +140,36 @@ def getModel2():
                   metrics=[precision,recall,f1_score])
     model.summary()
     return model
 
+# def getModel2():
+#     '''
+#     @summary: time classification model
+#     '''
+#     L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+#     L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
+#     R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+#     R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
+#
+#     L_input_drop = Dropout(0.3)(L_input)
+#     R_input_drop = Dropout(0.3)(R_input)
+#     # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
+#     L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
+#     L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
+#     # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
+#     R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
+#     R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+#     concat = layers.merge([L_att, R_att], mode='concat')
+#
+#     concat = Dropout(0.2)(concat)
+#     output = layers.Dense(output_shape[0],activation="softmax")(concat)
+#
+#     model = models.Model(inputs=[L_input,R_input], outputs=output)
+#
+#     learn_rate = 0.00005
+#     model.compile(optimizer=optimizers.Adam(lr=learn_rate),
+#                   loss=losses.binary_crossentropy,
+#                   metrics=[precision,recall,f1_score])
+#     model.summary()
+#     return model
 
 def getModel3():
     '''
@@ -121,8 +180,8 @@ def getModel3():
     R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
     R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
 
-    L_input_drop = Dropout(0.2)(L_input)
-    R_input_drop = Dropout(0.2)(R_input)
+    L_input_drop = Dropout(0.3)(L_input)
+    R_input_drop = Dropout(0.3)(R_input)
     # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
     L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
     # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
@@ -133,7 +192,7 @@ def getModel3():
     att = Attention02()(concat,mask=K.squeeze(concat_mask,axis=-1))
     # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
     # concat = layers.merge([L_att, R_att], mode='concat')
-    att = Dropout(0.3)(att)
+    att = Dropout(0.2)(att)
     output = layers.Dense(output_shape[0],activation="softmax")(att)
 
     model = models.Model(inputs=[L_input,R_input], outputs=output)
@@ -145,6 +204,72 @@ def getModel3():
     model.summary()
     return model
 
 
+class Attention(Layer):
+    """Multi-head attention.
+    """
+    def __init__(self, nb_head, size_per_head, **kwargs):
+        self.nb_head = nb_head
+        self.size_per_head = size_per_head
+        self.out_dim = nb_head * size_per_head
+        super(Attention, self).__init__(**kwargs)
+    def build(self, input_shape):
+        super(Attention, self).build(input_shape)
+        q_in_dim = input_shape[0][-1]
+        k_in_dim = input_shape[1][-1]
+        v_in_dim = input_shape[2][-1]
+        self.q_kernel = self.add_weight(name='q_kernel',
+                                        shape=(q_in_dim, self.out_dim),
+                                        initializer='glorot_normal')
+        self.k_kernel = self.add_weight(name='k_kernel',
+                                        shape=(k_in_dim, self.out_dim),
+                                        initializer='glorot_normal')
+        self.v_kernel = self.add_weight(name='w_kernel',
+                                        shape=(v_in_dim, self.out_dim),
+                                        initializer='glorot_normal')
+    def mask(self, x, mask, mode='mul'):
+        if mask is None:
+            return x
+        else:
+            for _ in range(K.ndim(x) - K.ndim(mask)):
+                mask = K.expand_dims(mask, K.ndim(mask))
+            if mode == 'mul':
+                return x * mask
+            else:
+                return x - (1 - mask) * 1e10
+    def call(self, inputs):
+        q, k, v = inputs[:3]
+        v_mask, q_mask = None, None
+        if len(inputs) > 3:
+            v_mask = inputs[3]
+            if len(inputs) > 4:
+                q_mask = inputs[4]
+        # linear projections
+        qw = K.dot(q, self.q_kernel)
+        kw = K.dot(k, self.k_kernel)
+        vw = K.dot(v, self.v_kernel)
+        # split into heads
+        qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
+        kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
+        vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
+        # transpose to (batch, heads, steps, size_per_head)
+        qw = K.permute_dimensions(qw, (0, 2, 1, 3))
+        kw = K.permute_dimensions(kw, (0, 2, 1, 3))
+        vw = K.permute_dimensions(vw, (0, 2, 1, 3))
+        # scaled dot-product attention
+        a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head**0.5
+        a = K.permute_dimensions(a, (0, 3, 2, 1))
+        a = self.mask(a, v_mask, 'add')
+        a = K.permute_dimensions(a, (0, 3, 2, 1))
+        a = K.softmax(a)
+        # combine heads into the output
+        o = K.batch_dot(a, vw, [3, 2])
+        o = K.permute_dimensions(o, (0, 2, 1, 3))
+        o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
+        o = self.mask(o, q_mask, 'mul')
+        return o
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0][0], input_shape[0][1], self.out_dim)
+
 class Attention02(Layer):
     def __init__(self, **kwargs):
         self.init = initializers.get('normal')
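
(Note: the Attention layer added above follows the usual q/k/v calling convention, with optional value and query masks as the 4th and 5th inputs. A hedged self-attention usage sketch, assuming a (batch, steps, dim) input; illustration only.)

    x = layers.Input(shape=(40, 128), dtype='float32')
    x_mask = Lambda(lambda t: K.cast(K.not_equal(K.sum(t, axis=-1), 0), 'float32'))(x)
    # q = k = v = x; the mask damps padded steps in the softmax and zeroes them in the output
    h = Attention(8, 16)([x, x, x, x_mask, x_mask])   # -> (batch, 40, 8*16)
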
@@ -530,11 +655,216 @@ def train3():
     # # y_pre2 = load_model.predict(train_x[0])
     # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     # print(res2)
 
+
+def train4():
+    # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
+    data_load = pd.read_excel("tokens_tolabel_data1_res13New.xlsx", index_col=0)
+    # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
+    # data_load = data_load[data_load['pre_label_prob']>0.97]
+    # data_load = data_load[data_load['is_same']==1]
+    data_zero = pd.read_excel("time_entity5.xlsx")
+    data_zero = data_zero[(data_zero['viewed']==1)|(data_zero['is_same']==2)]
+    # data_old = pd.read_excel("tokens_data_02.xlsx")
+    data_old = pd.read_excel("tokens_data_02_res7New.xlsx")
+    data_delay1 = pd.read_excel("delayTime_entity1.xlsx")
+    data_delay1 = data_delay1[data_delay1['label']!=0]
+    data_delay2 = pd.read_excel("delayTime_entity2.xlsx")
+
+    # data_zero = pd.concat([data_zero,data_zero])
+    # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
+    # data_zero = data_zero.sample(n=80000)
+    print("input shape:", input_shape2)
+    data_x = []
+    data_y = []
+    import random
+    for left, right, label,_label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label'], data_load['label']):
+        # if label==_label:
+
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+    # data_load2 = data_load[data_load['re_label']==0]
+    # for left, right, label,_label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
+    #     if label==_label:
+    #         y = np.zeros(output_shape)
+    #         y[label] = 1
+    #         left = eval(left)
+    #         left = left[-40:]
+    #         if len(left)>30:
+    #             left = left[2:]
+    #         elif len(left)>15:
+    #             left = left[1:]
+    #         right = eval(right)
+    #         right = right[:40]
+    #         if len(right)>15:
+    #             right = right[:-1]
+    #         context = [left, right]
+    #         # x = embedding(context, shape=input_shape2)
+    #         data_x.append(context)
+    #         data_y.append(y)
+
+    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['re_label']):
+
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+
+    for left, right, label in zip(data_delay1['context_left'], data_delay1['context_right'], data_delay1['label']):
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+    for left, right, label in zip(data_delay2['context_left'], data_delay2['context_right'], data_delay2['re_label']):
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+
+    # for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
+    #     y = np.zeros(output_shape)
+    #     y[label] = 1
+    #     left = eval(left)
+    #     left = left[-40:]
+    #     if len(left) > 30:
+    #         left = left[2:]
+    #     elif len(left) > 15:
+    #         left = left[1:]
+    #     right = eval(right)
+    #     right = right[:40]
+    #     if len(right) > 15:
+    #         right = right[:-1]
+    #     context = [left, right]
+    #     # x = embedding(context, shape=input_shape2)
+    #     data_x.append(context)
+    #     data_y.append(y)
+
+    # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
+    #     y = np.zeros(output_shape)
+    #     y[label] = 1
+    #     left = eval(left)
+    #     left = left[-40:]
+    #     right = eval(right)
+    #     right = right[:40]
+    #     context = [left, right]
+    #     # x = embedding(context, shape=input_shape2)
+    #     data_x.append(context)
+    #     data_y.append(y)
+    for left, right, label,pre_label,is_same in zip(data_old['context_left'], data_old['context_right'], data_old['label'],
+                                                    data_old['pre_label'],data_old['is_same']):
+        if label==0:
+            if is_same==1:
+                pass
+            else:
+                if pre_label>3:
+                    label = pre_label
+                else:
+                    continue
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+
+    _data = [d for d in zip(data_x,data_y)]
+    random.shuffle(_data)
+    data_x = [i[0] for i in _data]
+    data_y = [i[1] for i in _data]
+    test_len = int(len(data_x) * 0.11)
+    test_x = data_x[:test_len]
+    test_y = data_y[:test_len]
+    print("test set size:", len(test_x))
+    train_x = data_x[test_len:]
+    train_y = data_y[test_len:]
+
+    # for left, right, label,pre_label,is_same in zip(data_old['context_left'], data_old['context_right'], data_old['label'],
+    #                                                 data_old['pre_label'],data_old['is_same']):
+    #     # if label==0:
+    #     #     if random.random()>0.25:
+    #     #         continue
+    #     if label==0:
+    #         if is_same==1:
+    #             pass
+    #         else:
+    #             if pre_label>3:
+    #                 label = pre_label
+    #             else:
+    #                 continue
+    #     y = np.zeros(output_shape)
+    #     y[label] = 1
+    #     left = eval(left)
+    #     left = left[-40:]
+    #     right = eval(right)
+    #     right = right[:40]
+    #     context = [left, right]
+    #     # x = embedding(context, shape=input_shape2)
+    #     train_x.append(context)
+    #     train_y.append(y)
+    print("training set size:", len(train_x))
+
+    # train_y, test_y = np.array(train_y), np.array(test_y)
+    # train_x = np.array(train_x)
+    # test_x = np.array(test_x)
+    # test_x = np.transpose(test_x, (1, 0, 2, 3))
+    # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
+    training_generator = DataGenerator(train_x, train_y,is_train=True)
+    # training_generator = DataGenerator(data_x, data_y)
+    validation_generator = DataGenerator(test_x, test_y,is_train=False,shuffle=False)
+
+    # model = getModel3()
+    model = getModel2()
+    epochs = 100
+    # batch_size = 256
+    checkpoint = ModelCheckpoint("model_time_classify.weights",save_weights_only=True, monitor="val_loss", verbose=1,
+                                 save_best_only=True, mode='min')
+    # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
+    #                              save_best_only=True, mode='min')
+
+    history = model.fit_generator(
+        generator=training_generator,
+        validation_data=validation_generator,
+        use_multiprocessing=True, workers=2,
+        epochs=epochs,
+        shuffle=True,
+        callbacks=[checkpoint],
+        class_weight='auto'
+    )
+
 from keras.utils import Sequence,to_categorical
 class DataGenerator(Sequence):
     'Generates data for Keras'
-    def __init__(self, texts, labels, batch_size=256,
-                 n_classes=4, shuffle=True):
+    def __init__(self, texts, labels, is_train=True,batch_size=256,
+                 n_classes=len(time_label_dict), shuffle=True):
         'Initialization'
         # self.dim = dim
         self.batch_size = batch_size
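
(Note: class_weight='auto' in fit_generator above was dropped in later Keras releases; the sklearn class_weight helper already imported at the top can compute explicit weights instead. Hedged sketch, assuming train_y holds the one-hot targets built in train4(); illustration only.)

    labels = np.argmax(np.array(train_y), axis=1)
    weights = class_weight.compute_class_weight('balanced', classes=np.unique(labels), y=labels)
    weight_dict = dict(zip(np.unique(labels), weights))   # pass as class_weight=weight_dict
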
@@ -542,6 +872,7 @@ class DataGenerator(Sequence):
         self.texts = texts
         self.n_classes = n_classes
         self.shuffle = shuffle
+        self.is_train = is_train
         self.on_epoch_end()
 
     def __len__(self):
@@ -583,8 +914,22 @@ class DataGenerator(Sequence):
         # Generate data
         for i, context in enumerate(list_texts):
             # Store sample
-            # tokens = preprocess2(text)
-            # tokens = tokens[:maxlen]
+            if self.is_train:
+                left = context[0]
+                if len(left) > 30:
+                    if random.random() > 0.5:
+                        left = left[2:]
+                elif len(left) > 15:
+                    if random.random() > 0.5:
+                        left = left[1:]
+                right = context[1]
+                if len(right) > 30:
+                    if random.random() > 0.5:
+                        right = right[:-2]
+                elif len(right) > 15:
+                    if random.random() > 0.5:
+                        right = right[:-1]
+                context = [left, right]
             words_matrix = embedding_mywords(context, shape=input_shape2)
             # Store class
             # y[i] = _label[i]
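
(Note: the is_train branch above is a light augmentation: for longer contexts it flips a coin and trims one or two tokens from the outer edge, so the model does not latch onto exact window lengths. The same idea as a standalone hedged sketch; illustration only.)

    def random_trim(tokens, from_left, rng=random):
        """Randomly drop 1-2 outer tokens from an over-long context window."""
        n = 2 if len(tokens) > 30 else 1 if len(tokens) > 15 else 0
        if n and rng.random() > 0.5:
            return tokens[n:] if from_left else tokens[:-n]
        return tokens
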
@@ -647,7 +992,11 @@ def predict3():
     new_data.to_excel("new_tokens_data1_res.xlsx")
 
 def predict4():
-    data = pd.read_csv("tokens_tolabel_data1_res11.csv", chunksize=3000)
+    data = pd.read_csv("tokens_data_02_res6New.csv", chunksize=3000)
+    # data = pd.read_excel("C:\\Users\\Administrator\\Desktop\\time_entity4.xlsx")
+    # data.to_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv")
+    # data = pd.read_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv", chunksize=3000)
+
     model1 = getModel2()
     model1.load_weights("model_time_classify.weights")
     new_data = pd.DataFrame()
@@ -671,14 +1020,15 @@ def predict4():
         pre_y = model1.predict([test_x[0], test_x[1]])
         _data['pre_label'] = [np.argmax(item) for item in pre_y]
         _data['pre_label_prob'] = [max(item) for item in pre_y]
-        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['re_label'],_data['pre_label'])]
+        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['label'],_data['pre_label'])]
         # _data['is_same'] = [1 if int(_re)==int(_pre) and int(_re)==int(_label) else 0 for _label,_re,_pre in zip(_data['label'],_data['re_label'],_data['pre_label'])]
         # data['label'] = label
         new_data = pd.concat([new_data, _data])
         idx += 3000
         print(idx)
-    # data.to_csv("new_tokens_data1.csv")
-    new_data.to_excel("tokens_tolabel_data1_res12.xlsx")
+    # new_data.to_csv("tokens_data_02_res7New.csv")
+    new_data.to_excel("tokens_data_02_res7New.xlsx")
+    # new_data.to_excel("C:\\Users\\Administrator\\Desktop\\tokens_data_02_res7New.xlsx")
 
 
 def predict():
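
(Note: predict4() leans on pandas chunked reading so the corpus never has to fit in memory at once; the loop pattern is roughly as below. Hedged sketch with a hypothetical build_inputs helper standing in for the embedding step; illustration only.)

    for _data in pd.read_csv("tokens_data_02_res6New.csv", chunksize=3000):
        test_x = build_inputs(_data)            # hypothetical: embeds context_left/context_right
        pre_y = model1.predict([test_x[0], test_x[1]])
        _data['pre_label'] = [np.argmax(item) for item in pre_y]
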
@@ -863,7 +1213,7 @@ def save_model():
     test_model = getModel2()
     test_model.load_weights("model_time_classify.weights")
     tf.saved_model.simple_save(sess,
-                               "models/timesplit_model/",
+                               "models/timesplit_model2/",
                                inputs={"input0": test_model.input[0],
                                        "input1":test_model.input[1]
                                        },
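
(Note: a directory written by tf.saved_model.simple_save can be reloaded in a fresh TF1 session as below; tensors are then fetched by the names recorded in the signature. Untested sketch, assuming the export path above; illustration only.)

    with tf.Session(graph=tf.Graph()) as sess:
        tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                   "models/timesplit_model2/")
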
@@ -879,6 +1229,7 @@ if __name__ == '__main__':
     # training()
     # train2()
     # train3()
+    # train4()
     # data_process()
     # data_process2()
     # data_process3()