소스 검색

增加数据重新训练联系人分类模型

Jiasheng 4 년 전
부모
커밋
9fcb0d621f

BIN
BiddingKG/dl/interface/person_savedmodel_new/saved_model.pb


BIN
BiddingKG/dl/interface/person_savedmodel_new/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/person_savedmodel_new/variables/variables.index


BIN
BiddingKG/dl/test/model_person_classify_fjs.model.hdf5


BIN
BiddingKG/dl/test/person_save_model_new/saved_model.pb → BiddingKG/dl/test/person_savedmodel_new/saved_model.pb


BIN
BiddingKG/dl/test/person_save_model_new/variables/variables.data-00000-of-00001 → BiddingKG/dl/test/person_savedmodel_new/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/test/person_save_model_new/variables/variables.index → BiddingKG/dl/test/person_savedmodel_new/variables/variables.index


+ 56 - 1
BiddingKG/dl/test/test_data_fjs.py

@@ -1514,6 +1514,59 @@ def washData():
     # df.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_all_re_washed2.csv")
 
 
+def relabel():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
+    # df = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
+    df1 = df
+    for index, row in df.iterrows():
+        if row["Label"] == 1:
+            df1["Label"][index] == 3
+        if row["Label"] == 2:
+            df1["Label"][index] == 1
+        if row["Label"] == 3:
+            df1["Label"][index] == 2
+
+    df2 = df1
+    for index, row in df1.iterrows():
+        if row["Label"] == 1:
+            ss = row["Sentence"].split("||")
+            forward = ss[0][-30:]
+            if "。 联系人" in forward or ", 联系人" in forward \
+                    or ", 联系 方式" in forward or "。 联系 方式" in forward:
+                df2["Label"][index] = 3
+
+        if row["Label"] == 2:
+            ss = row["Sentence"].split("||")
+            forward = ss[0][-30:]
+            if "。 联系人" in forward or ", 联系人" in forward \
+                    or ", 联系 方式" in forward or "。 联系 方式" in forward:
+                df2["Label"][index] = 3
+
+    df2 = df2[["Word", "Label", "Sentence", "BIO"]]
+    df2.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
+    # df2.to_csv("C:\\Users\\admin\\Desktop\\test2000_new.csv")
+
+
+def relabel2():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
+    # df = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
+    df1 = df
+    for index, row in df1.iterrows():
+        if row["Label"] == 3:
+            ss = row["Sentence"].split("||")
+            forward = ss[0][-20:]
+            if "采购 " in forward and "窗口" not in forward and "公司" not in forward \
+                    and "窗口" not in forward and "文件" not in forward \
+                    and "质疑" not in forward and "中心" not in forward\
+                    and "处" not in forward:
+            # if "招标 " in forward:
+                print(forward)
+                df1["Label"][index] = 1
+    df1 = df1[["Word", "Label", "Sentence", "BIO"]]
+    # print(df1)
+    # df1.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
+
+
 if __name__ == "__main__":
     # Postgre2Data()
     # data2BIOData()
@@ -1563,4 +1616,6 @@ if __name__ == "__main__":
     # re_serviceTime2()
     # re_Accuracy("serviceTime_text1")
     # test_re()
-    re_serviceTime3()
+    # re_serviceTime3()
+    # relabel()
+    relabel2()

+ 55 - 6
BiddingKG/dl/test/test_model_fjs.py

@@ -1,5 +1,6 @@
 import sys
 
+import psycopg2
 from keras.models import Model
 from keras.layers import Input, LSTM, Dense
 import numpy as np
@@ -351,8 +352,8 @@ def getData3(isTrain = True):
     '''
     :return:返回训练数据或测试数据的词嵌入,分前后两个句子,不包含中心词
     '''
-    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
-    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
+    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000_new.csv")
 
     test_data_len = df.shape[0] * 0.2
     if isTrain:
@@ -404,10 +405,51 @@ def getData3(isTrain = True):
         data_x.append(item_x)
         data_y.append(item_y)
 
+    data_x1, data_y1 = getDataFromPG((2, 35, 128), [5])
+    data_x = data_x + data_x1
+    data_y = data_y + data_y1
     print(np.array(data_x).shape, np.array(data_y).shape)
     return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context
 
 
def getDataFromPG(input_shape, output_shape):
    """Fetch hand-labelled person entities from PostgreSQL as training data.

    :param input_shape: embedding shape, e.g. (2, 35, 128); input_shape[1]
        is the span-window size handed to spanWindow.
    :param output_shape: shape of one one-hot label vector, e.g. [5].
    :return: (data_x, data_y) — lists of embedded context windows and the
        matching one-hot numpy label vectors.
    """
    conn = psycopg2.connect(dbname="BiddingKG", user="postgres", password="postgres",
                            host="192.168.2.101")
    try:
        cursor = conn.cursor()
        sql = "select B.tokens,A.begin_index,A.end_index,C.label,A.entity_id " \
              "from train_entity_copy A,train_sentences_copy B,hand_label_person C " \
              "where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index " \
              "and A.entity_type='person' and A.entity_id=C.entity_id and C.label!=0 " \
              "and C.label!=3;"
        cursor.execute(sql)
        print(sql)

        data_x = []
        data_y = []
        all_limit = 250000  # hard cap on the number of samples pulled
        total = 0
        # Bug fix: the original called cursor.fetchmany(1000) INSIDE the row
        # loop, fetching (and discarding) a fresh 1000-row batch for every
        # single row processed — skipping ~999 of every 1000 rows. The batch
        # is now fetched once per outer iteration, and the limit check stops
        # the outer loop instead of only breaking out of the inner one.
        rows = cursor.fetchmany(1000)
        while rows and total < all_limit:
            for row in rows:
                if total >= all_limit:
                    break
                item_x = embedding(spanWindow(tokens=row[0], begin_index=row[1], end_index=row[2],
                                              size=input_shape[1]), shape=input_shape)
                item_y = np.zeros(output_shape)
                item_y[row[3]] = 1  # row[3] is the hand label; one-hot encode it
                data_x.append(item_x)
                data_y.append(item_y)
                total += 1
            rows = cursor.fetchmany(1000)
    finally:
        conn.close()  # the original leaked the connection
    return data_x, data_y
+
+
 def getData2(isTrain = True):
     '''
     :return:返回训练数据或测试数据的词嵌入,前后连成一个句子,包含中心词
@@ -518,7 +560,9 @@ def train():
 
     # 回调checkpoint,保存loss最小的模型
     checkpoint = ModelCheckpoint(model_file, monitor="val_loss", verbose=1, save_best_only=True, mode='min')
-    history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y, validation_data=([test_x[0], test_x[1]], test_y), epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
+    history_model = model.fit(x=[train_x[0], train_x[1]], class_weight='auto',
+                              y=train_y, validation_data=([test_x[0], test_x[1]], test_y),
+                              epochs=25, batch_size=256, shuffle=True, callbacks=[checkpoint])
     # history_model = model.fit(x=[train_x[0], train_x[0]], y=train_y, validation_data=([test_x[0], test_x[0]], test_y), class_weight='auto', epochs=100, batch_size=256, shuffle=True, callbacks=[checkpoint])
     # history_model = model.fit(x=[train_x[0], train_x[0]], y=train_y, validation_split=0.2, class_weight='auto', epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
 
@@ -544,9 +588,11 @@ def predict():
 
 def predict2Csv():
     df = pd.DataFrame(np.argmax(predict(), axis=1))
-    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
+    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000_new.csv")
+    # df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
 
     df1 = df1[0:3700]
+
     df1["predict_Label"] = df
 
     df1.to_csv("C:\\Users\\admin\\Desktop\\result3.csv")
@@ -570,7 +616,7 @@ def hdf52savemodel():
             sess.run(tf.global_variables_initializer())
             h5_to_graph(sess, graph, filepath)
             tf.saved_model.simple_save(sess,
-                                       "./person_save_model_new/",
+                                       "./person_savedmodel_new/",
                                        inputs={"input0":time_model.input[0],
                                                "input1":time_model.input[1]},
                                        outputs={"outputs":time_model.output})
@@ -583,4 +629,7 @@ if __name__ == "__main__":
     # predict2Csv()
     hdf52savemodel()
 
-    # getData3()
+    # getData3()
+    # x, y = getDataFromPG((2, 35, 128), [5])
+    # print(x)
+    # print(y)