@@ -13,14 +13,52 @@ from sklearn.utils import shuffle,class_weight
 import matplotlib.pyplot as plt
 
 input_shape = (2,30,60)
+input_shape2 = (2,10,128)
 output_shape = [4]
 
+def get_data():
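+    # Fetch the documents listed in newdata_30_prc.csv from the iepy Postgres database,
+    # keep the brat "time" annotations, parse the stringified sentences/tokens/offsets
+    # columns back into lists, and pickle the result as db_time_data.pk.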
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
+    id_set = set()
+    for id in data_load['document_id']:
+        id_set.add(id)
+    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
+    sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
+          "FROM corpus_iedocument A,brat_bratannotation B " \
+          "WHERE A.human_identifier = '%s' " \
+          "AND A.human_identifier = B.document_id "
+    db_data = []
+    count = 0
+    for id in list(id_set):
+        count+=1
+        print(count)
+        cur1 = conn.cursor()
+        cur1.execute(sql % (id))
+        db_data.extend(cur1.fetchall())
+        cur1.close()
+    conn.close()
+    columns = ['document_id','sentences','tokens','offsets_to_text','value']
+    df = pd.DataFrame(db_data, columns=columns)
+    df = df[df['value'].str.contains('time')]
+    df = df.reset_index(drop=True)
+    print(len(df))
+    time_label = df['value'].str.split(expand=True)
+    time_label.columns = ['_', 'label_type', 'begin_index', 'end_index', 'entity_text']
+    time_label = time_label.drop('_', axis=1)
+    df = pd.concat([df, time_label], axis=1)
+    print(df.info())
+    df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
+    df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
+    df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
+    df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
+    df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
+    save(df,'db_time_data.pk')
+
 def getModel():
     '''
     @summary: time classification model
     '''
-    L_input = layers.Input(shape=input_shape[1:], dtype='float32')
-    R_input = layers.Input(shape=input_shape[1:], dtype='float32')
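+    # inputs now follow input_shape2 (sequence length 10, feature size 128)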
+    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
     L_lstm = layers.Bidirectional(layers.LSTM(40,return_sequences=True,dropout=0.1))(L_input)
     # L_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(L_input)
     avg_l = layers.GlobalAveragePooling1D()(L_lstm)
@@ -40,36 +78,11 @@ def getModel():
     model.summary()
     return model
 
-def getModel_center():
-    '''
-    @summary: time classification model
-    '''
-    L_input = layers.Input(shape=input_shape[1:], dtype='float32')
-    R_input = layers.Input(shape=input_shape[1:], dtype='float32')
-    center_shape = (25, 60)
-    C_input = layers.Input(shape=center_shape, dtype='float32')
-    L_lstm = layers.Bidirectional(layers.LSTM(32,return_sequences=True,dropout=0.2))(L_input)
-    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
-    C_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(C_input)
-    avg_c = layers.GlobalAveragePooling1D()(C_lstm)
-    R_lstm = layers.Bidirectional(layers.LSTM(32,return_sequences=True,dropout=0.2))(R_input)
-    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
-    concat = layers.merge([avg_l, avg_c, avg_r], mode='concat')
-
-    output = layers.Dense(output_shape[0],activation="softmax")(concat)
-
-    model = models.Model(inputs=[L_input,C_input,R_input], outputs=output)
-    learn_rate = 0.0005
-    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
-                  loss=losses.binary_crossentropy,
-                  metrics=[precision,recall,f1_score])
-    model.summary()
-    return model
-
 
 def training():
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
-    test_data = data_load.sample(frac=0.2, random_state=7)
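+    # make sure the index is a clean 0..n range before the sample/drop train-test split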
+    data_load = data_load.reset_index(drop=True)
+    test_data = data_load.sample(frac=0.2, random_state=8)
     train_data = data_load.drop(test_data.index, axis=0)
     train_data =train_data.reset_index(drop=True)
 
@@ -139,35 +152,32 @@ def training():
     res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     print(res2)
 
-def training_center():
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)
-    test_data = data_load.sample(frac=0.25, random_state=7)
+def train2():
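+    # train the two-input (left/right token context) model on tokens_data.csv produced by data_process3()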
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
+    data_load = data_load.reset_index(drop=True)
+    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
+    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
+    test_data = data_load.sample(frac=0.2, random_state=8)
     train_data = data_load.drop(test_data.index, axis=0)
     train_data =train_data.reset_index(drop=True)
 
     train_x = []
     train_y = []
-    for left, center, right, label in zip(train_data['context_left'], train_data['entity_time'], train_data['context_right'], train_data['re_label']):
+    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
-        center = ''.join(str(center))
-        context = [left,center, right]
-        x = embedding_word(context, shape=(3,25,60))
+        context = [left, right]
+        x = embedding(context, shape=input_shape2)
         train_x.append(x)
         train_y.append(y)
 
     test_x = []
     test_y = []
-    for left, center, right, label in zip(test_data['context_left'], train_data['entity_time'], test_data['context_right'], test_data['re_label']):
+    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
-        center = ''.join(str(center))
-        context = [left, center, right]
-        x = embedding_word(context, shape=(3,25,60))
+        context = [left, right]
+        x = embedding(context, shape=input_shape2)
         test_x.append(x)
         test_y.append(y)
 
@@ -175,79 +185,83 @@ def training_center():
     train_x, test_x = (np.array(train_x), np.array(test_x))
     train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
 
-    model = getModel_center()
-    epochs = 70
+    model = getModel()
+    epochs = 150
     batch_size = 256
     checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                  save_best_only=True, mode='min')
     # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
     # cw = dict(enumerate(cw))
     history = model.fit(
-        x=[train_x[0], train_x[1], train_x[2]],
+        x=[train_x[0], train_x[1]],
         y=train_y,
-        validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
-        # validation_data=(test_x[0],test_y),
+        validation_data=([test_x[0], test_x[1]], test_y),
         epochs=epochs,
         batch_size=batch_size,
         shuffle=True,
         callbacks=[checkpoint],
         class_weight='auto'
     )
-    plot_loss(history = history)
+    # plot_loss(history=history)
     load_model = models.load_model("model_label_time_classify.model.hdf5",
                                    custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
-    y_pre = load_model.predict([test_x[0], test_x[1], test_x[2]])
+    y_pre = load_model.predict([test_x[0], test_x[1]])
     # y_pre = load_model.predict(test_x[0])
     # prediction evaluation for each class
     res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
     print(res1)
-    y_pre2 = load_model.predict([train_x[0], train_x[1], train_x[2]])
+    y_pre2 = load_model.predict([train_x[0], train_x[1]])
     # y_pre2 = load_model.predict(train_x[0])
     res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     print(res2)
 
-def predict():
+
+def predict2():
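+    # run the saved model over the full token-level dataset and write the misclassified rows to CSV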
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
+    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
+    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
     test_x = []
     test_y = []
-    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
+    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
         context = [left, right]
-        x = embedding_word(context, shape=input_shape)
+        x = embedding(context, shape=input_shape2)
         test_x.append(x)
         test_y.append(y)
     test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
     pre_y = model1.predict([test_x[0],test_x[1]])
     data_load['pre'] = [np.argmax(item) for item in pre_y]
-    error_data = data_load[data_load['re_label']!=data_load['pre']]
+    error_data = data_load[data_load['label']!=data_load['pre']]
     # print(error_data.info())
-    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error4-0.2-0.6_30.csv")
+    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
 
-def predict_center():
+def predict():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
     test_x = []
    test_y = []
-    for left, center, right, label in zip(data_load['context_left'],data_load['entity_time'], data_load['context_right'], data_load['re_label']):
+    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
-        center = ''.join(str(center))
-        context = [left, center, right]
-        x = embedding_word(context, shape=(3, 25, 60))
+        left = str(left)
+        right = str(right)
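+        # empty context cells are read back as NaN; str(nan) == 'nan', so blank them out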
+        if left == 'nan': left = ''
+        if right == 'nan': right = ''
+        left = list(left)
+        right = list(right)
+        context = [left, right]
+        x = embedding_word(context, shape=input_shape)
         test_x.append(x)
         test_y.append(y)
     test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
-    pre_y = model1.predict([test_x[0],test_x[1],test_x[2]])
+    pre_y = model1.predict([test_x[0],test_x[1]])
     data_load['pre'] = [np.argmax(item) for item in pre_y]
     error_data = data_load[data_load['re_label']!=data_load['pre']]
     # print(error_data.info())
-    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error_center.csv")
+    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
+
 
 def data_process():
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
@@ -273,6 +287,93 @@ def data_process():
     data_load['context_right'] = right_list
     data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")
 
+def data_process2():
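+    # trim the character-level contexts to the 20 characters nearest the entity on each side and save as newdata_20_prc.csv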
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
+    left_list = []
+    right_list = []
+    for left, right in zip(data_load['context_left'], data_load['context_right']):
+        left = str(left)
+        right = str(right)
+        if right=='nan':
+            right = ''
+        if left=='nan':
+            left = ''
+        left = left[max(len(left)-20,0):]
+        right = right[:20]
+        left_list.append(left)
+        right_list.append(right)
+    data_load['context_left'] = left_list
+    data_load['context_right'] = right_list
+    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_20_prc.csv")
+
+def data_process3():
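+    # Build token-level training data from the pickled DB dump: map each annotation's character
+    # offsets to token indices, take a 10-token window on each side (cut at the nearest "。"),
+    # attach re_label from newdata_30_prc2.csv (default 0) and write tokens_data.csv.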
+    data = load('db_time_data.pk')
+    data = data.drop('value', axis=1)
+    token_begin = []
+    token_end = []
+    context_left = []
+    context_right = []
+    data2 = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc2.csv")
+    label = []
+    # data=data[:20]
+    for id,sentences,tokens,offset,begin,end,entity_text in zip(data['document_id'],data['sentences'],data['tokens'],data['offsets_to_text'],
+                                                                data['begin_index'],data['end_index'],data['entity_text']):
+        _label = data2[(data2['document_id']==int(id)) & (data2['begin_index']==int(begin))][:1]
+        if not _label.empty:
+            _label = int(_label['re_label'])
+        else:
+            _label=0
+        label.append(_label)
+        begin = int(begin)
+        end = int(end)
+        entity_tbegin = 0
+        entity_tend = 0
+        find_begin = False
+
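+        # walk the character offsets to find the token span [entity_tbegin, entity_tend) covering the annotation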
+        for t in range(len(offset)):
+            if not find_begin:
+                if offset[t]==begin:
+                    entity_tbegin = t
+                    find_begin = True
+                if offset[t]>begin:
+                    entity_tbegin = t-1
+                    find_begin = True
+            if offset[t] >= end:
+                entity_tend = t
+                break
+        token_begin.append(entity_tbegin)
+        token_end.append(entity_tend)
+        s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend,size=10)
+        s1 = s[0]
+        _temp1 = []
+        for i in range(len(s1)):
+            if s1[i]=="。":
+                _temp1.append(i)
+        if _temp1:
+            s1 = s1[_temp1[-1]+1:]
+        s2 = s[1]
+        _temp2 = []
+        for i in range(len(s2)):
+            if s2[i] == "。":
+                _temp2.append(i)
+                break
+        if _temp2:
+            s2 = s2[:_temp2[0]+1]
+        # print(s2)
+        context_left.append(s1)
+        context_right.append(s2)
+        print(id)
+        # print(_label)
+        # print(entity_text)
+        # print(tokens[entity_tbegin:entity_tend])
+    data['token_begin'] = token_begin
+    data['token_end'] = token_end
+    data['context_left'] = context_left
+    data['context_right'] = context_right
+    data['label'] = label
+    data = data.drop(['tokens','offsets_to_text','sentences'],axis=1)
+    data.to_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv")
+
 def plot_loss(history):
     plt.plot(history.history['loss'])
     plt.plot(history.history['val_loss'])
@@ -283,25 +384,14 @@ def plot_loss(history):
     plt.show()
 
 if __name__ == '__main__':
+    # get_data()
     # getModel()
-    # getModel_center()
     # training()
+    # train2()
     # data_process()
-    # training_center()
+    # data_process2()
+    # data_process3()
     # predict()
-    # predict_center()
-    model1 = models.load_model("model_label_time_classify.model.hdf5",
-                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
-    test_x = []
-    test_y = []
-    left = '8675.20元人民币,(3)服务期限:'
-    right = '(4)质量:符合竞争性磋商文件规定的质'
-    context = [left, right]
-    x = embedding_word(context, shape=input_shape)
-    test_x.append(x)
-    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
-    pre_y = model1.predict([test_x[0],test_x[1]])
-    rs = [np.argmax(item) for item in pre_y]
-    print(pre_y, rs)
+    # predict2()
 
     pass