@@ -1,5 +1,6 @@
 import sys
+import psycopg2
 from keras.models import Model
 from keras.layers import Input, LSTM, Dense
 import numpy as np
@@ -351,8 +352,8 @@ def getData3(isTrain = True):
     '''
     :return: word embeddings for the training or test data, split into the two sentences before and after the target, excluding the center word
     '''
-    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
-    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
+    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000_new.csv")

     test_data_len = df.shape[0] * 0.2
     if isTrain:
@@ -404,10 +405,58 @@ def getData3(isTrain = True):
             data_x.append(item_x)
             data_y.append(item_y)

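+    # Augment the CSV samples with hand-labeled person entities from Postgres;
+    # (2, 35, 128) presumably means two spans of 35 tokens with 128-dim word
+    # embeddings, and [5] is the shape of the one-hot label vector.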
+    data_x1, data_y1 = getDataFromPG((2, 35, 128), [5])
+    data_x = data_x + data_x1
+    data_y = data_y + data_y1
     print(np.array(data_x).shape, np.array(data_y).shape)
     return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context


+def getDataFromPG(input_shape, output_shape):
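+    # Fetch person entities with hand labels (labels 0 and 3 excluded) joined
+    # to their sentence tokens, embed a token window around each entity, and
+    # return the embeddings together with one-hot label vectors.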
+    conn = psycopg2.connect(dbname="BiddingKG", user="postgres", password="postgres",
+                            host="192.168.2.101")
+    cursor = conn.cursor()
+    sql = "select B.tokens,A.begin_index,A.end_index,C.label,A.entity_id " \
+          "from train_entity_copy A,train_sentences_copy B,hand_label_person C " \
+          "where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index " \
+          "and A.entity_type='person' and A.entity_id=C.entity_id and C.label!=0 " \
+          "and C.label!=3;"
+    cursor.execute(sql)
+    print(sql)
+
+    data_x = []
+    data_y = []
+    rows = cursor.fetchmany(1000)
+    allLimit = 250000
+    total = 0
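+    # stream results in batches of 1000 rows, capped at allLimit samples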
+    while rows and total < allLimit:
+        for row in rows:
+            if total >= allLimit:
+                break
+            item_x = embedding(spanWindow(tokens=row[0], begin_index=row[1], end_index=row[2],
+                                          size=input_shape[1]), shape=input_shape)
+            # item_x = encodeInput(spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10), word_len=50, word_flag=True,userFool=False)
+
+            # _span = spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10,word_flag=False)
+            # item_x = encodeInput(_span, word_len=10, word_flag=False,userFool=False)
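+            # one-hot encode the hand label (row[3] indexes into the 5 classes)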
+            item_y = np.zeros(output_shape)
+            item_y[row[3]] = 1
+            total += 1
+            data_x.append(item_x)
+            data_y.append(item_y)
+        rows = cursor.fetchmany(1000)
+    conn.close()
+    return data_x, data_y
+
+
 def getData2(isTrain = True):
     '''
     :return: word embeddings for the training or test data, with the spans before and after joined into one sentence, including the center word
@@ -518,7 +567,11 @@ def train():

     # ModelCheckpoint callback: save only the model with the lowest validation loss
     checkpoint = ModelCheckpoint(model_file, monitor="val_loss", verbose=1, save_best_only=True, mode='min')
-    history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y, validation_data=([test_x[0], test_x[1]], test_y), epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
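+    # class_weight='auto' assumes a Keras version that accepts the string form;
+    # recent Keras expects an explicit {class_index: weight} dict here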
+    history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y,
+                              validation_data=([test_x[0], test_x[1]], test_y), class_weight='auto',
+                              epochs=25, batch_size=256, shuffle=True, callbacks=[checkpoint])
     # history_model = model.fit(x=[train_x[0], train_x[0]], y=train_y, validation_data=([test_x[0], test_x[0]], test_y), class_weight='auto', epochs=100, batch_size=256, shuffle=True, callbacks=[checkpoint])
     # history_model = model.fit(x=[train_x[0], train_x[0]], y=train_y, validation_split=0.2, class_weight='auto', epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])

@@ -544,9 +597,12 @@ def predict():

 def predict2Csv():
     df = pd.DataFrame(np.argmax(predict(), axis=1))
-    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
+    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000_new.csv")
+    # df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")

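+    # the hard-coded 0:3700 slice presumably matches the number of rows scored by predict()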
     df1 = df1[0:3700]
+
     df1["predict_Label"] = df

     df1.to_csv("C:\\Users\\admin\\Desktop\\result3.csv")
@@ -570,7 +626,8 @@ def hdf52savemodel():
         sess.run(tf.global_variables_initializer())
         h5_to_graph(sess, graph, filepath)
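+        # export the model as a TensorFlow SavedModel directory (e.g. for TF Serving)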
         tf.saved_model.simple_save(sess,
-                                   "./person_save_model_new/",
+                                   "./person_savedmodel_new/",
                                    inputs={"input0":time_model.input[0],
                                            "input1":time_model.input[1]},
                                    outputs={"outputs":time_model.output})
@@ -583,4 +640,7 @@ if __name__ == "__main__":
     # predict2Csv()
     hdf52savemodel()

-    # getData3()
+    # getData3()
+    # x, y = getDataFromPG((2, 35, 128), [5])
+    # print(x)
+    # print(y)