Browse source

Update: use the model to identify table headers

fangjiasheng 3 years ago
parent
commit
7553b89e1c

+ 1 - 0
.idea/compiler.xml

@@ -1,6 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="CompilerConfiguration">
+    <option name="BUILD_PROCESS_HEAP_SIZE" value="11000" />
     <bytecodeTargetLevel>
       <module name="BiddingKG" target="8" />
     </bytecodeTargetLevel>

+ 8 - 0
.idea/inspectionProfiles/Project_Default.xml

@@ -2,5 +2,13 @@
   <profile version="1.0">
     <option name="myName" value="Project Default" />
     <inspection_tool class="DuplicatedCode" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyInterpreterInspection" enabled="false" level="WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredIdentifiers">
+        <list>
+          <option value="tensorflow.nn.bidirectional_dynamic_rnn" />
+        </list>
+      </option>
+    </inspection_tool>
   </profile>
 </component>

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_15" project-jdk-name="Python 3.5 (BiddingKG)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>

+ 2 - 2
BiddingKG.iml

@@ -2,13 +2,13 @@
 <module type="JAVA_MODULE" version="4">
   <component name="FacetManager">
     <facet type="Python" name="Python">
-      <configuration sdkName="Python 3.5 (dl_nlp)" />
+      <configuration sdkName="Python 3.5 (BiddingKG)" />
     </facet>
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
     <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
-    <orderEntry type="library" exported="" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
+    <orderEntry type="library" name="Python 3.5 (BiddingKG) interpreter library" level="application" />
   </component>
 </module>

+ 29 - 0
BiddingKG/dl/common/Utils.py

@@ -686,6 +686,35 @@ def embedding_word(datas,shape):
         out_index += 1
     return embed
 
+
+def embedding_word_forward(datas,shape):
+    '''
+    @summary: look up the character vectors for the given tokens
+    @param:
+        datas: list of tokens
+        shape: shape of the result
+    @return: array, the word embeddings in the given shape
+    '''
+    model_w2v = getModel_word()
+    embed = np.zeros(shape)
+    length = shape[1]
+    out_index = 0
+    #print(datas)
+    for data in datas:
+        index = 0
+        for item in str(data)[:shape[1]]:
+            if index>=length:
+                break
+            if item in model_w2v.vocab:
+                embed[out_index][index] = model_w2v[item]
+                index += 1
+            else:
+                # embed[out_index][index] = model_w2v['unk']
+                index += 1
+        out_index += 1
+    return embed
+
+
 def formEncoding(text,shape=(100,60),expand=False):
     embedding = np.zeros(shape)
     word_model = getModel_word()

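For context on the new embedding_word_forward: it maps each cell string to a left-aligned sequence of character vectors, truncates at shape[1] characters, and leaves out-of-vocabulary characters as zeros. A minimal standalone sketch of that lookup, with a plain dict standing in for the gensim model returned by getModel_word() (an assumption, so the snippet runs on its own):

    # Sketch only: toy_w2v replaces the repo's word2vec model (assumption).
    import numpy as np

    toy_w2v = {"项": np.ones(4), "目": np.full(4, 0.5)}   # 4-dim toy vectors

    def embed_cells(cells, max_chars=10, dim=4):
        out = np.zeros((len(cells), max_chars, dim))
        for row, text in enumerate(cells):
            for col, ch in enumerate(str(text)[:max_chars]):
                if ch in toy_w2v:               # known character -> its vector
                    out[row][col] = toy_w2v[ch]
                # unknown characters stay zero, like the commented-out 'unk' branch
        return out

    print(embed_cells(["项目名称", "金额"]).shape)  # (2, 10, 4)
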
+ 25 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -8,6 +8,7 @@ import time
 import codecs
 
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
+from BiddingKG.dl.table_head.predict import predict
 
 sys.setrecursionlimit(1000000)
 sys.path.append(os.path.abspath("../.."))
@@ -414,6 +415,28 @@ def tableToText(soup):
         
         return inner_table,head_list
 
+    def set_head_model(inner_table):
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                inner_table[i][j] = inner_table[i][j][0]
+
+        # predict table headers with the model
+        predict_list = predict(inner_table)
+        with open(r"C:\Users\Administrator\Desktop\table_head_test.txt", "a") as f:
+            for i in range(len(predict_list)):
+                f.write(str(i) + " " + str(inner_table[i]) + "\n")
+                f.write(str(i) + " " + str(predict_list[i]) + "\n")
+            f.write("\n")
+
+        # print("table_list", inner_table)
+        # print("predict_list", predict_list)
+
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                inner_table[i][j] = [inner_table[i][j], int(predict_list[i][j])]
+        head_list = sliceTable(inner_table)
+        return inner_table, head_list
+
     def setHead_incontext(inner_table,pat_head,fix_value="~~",prob_min=0.5):
 
         data_x,data_position = getPredictor("form").getModel("context").encode(inner_table)
@@ -969,7 +992,8 @@ def tableToText(soup):
         if len(inner_table)>0 and len(inner_table[0])>0:
             #inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
             #inner_table,head_list = setHead_inline(inner_table)
-            inner_table,head_list = setHead_initem(inner_table,pat_head)
+            # inner_table, head_list = setHead_initem(inner_table,pat_head)
+            inner_table, head_list = set_head_model(inner_table)
             # inner_table,head_list = setHead_incontext(inner_table,pat_head)
             # print(inner_table)
             # for begin in range(len(head_list[:-1])):

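For context on the new set_head_model: each cell is first reduced to its text, the table-head model predicts a 0/1 flag per cell, and the cell is rebuilt as [text, flag] before sliceTable runs. A small sketch of that round trip, using a stub in place of BiddingKG.dl.table_head.predict.predict (assumption):

    # Stub predictor (assumption): marks the first row as the header.
    def stub_predict(table):
        return [["1"] * len(row) if i == 0 else ["0"] * len(row)
                for i, row in enumerate(table)]

    inner_table = [["名称", "金额"], ["设备A", "100"]]
    labels = stub_predict(inner_table)
    # rebuild cells as [text, is_head], as set_head_model does before sliceTable
    inner_table = [[[cell, int(labels[i][j])] for j, cell in enumerate(row)]
                   for i, row in enumerate(inner_table)]
    print(inner_table)  # [[['名称', 1], ['金额', 1]], [['设备A', 0], ['100', 0]]]
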
+ 62 - 17
BiddingKG/dl/table_head/check_user_label_accuracy.py

@@ -8,15 +8,16 @@ def user_label_accuracy(update_user):
         sql = """
         select table_text, pre_label, post_label, id
         from label_table_head_info 
-        where update_user='""" + update_user + "' order by id desc limit 3000"
+        where update_user='""" + update_user + "' order by update_time"
     else:
         sql = """
         select table_text, pre_label, post_label, id
         from label_table_head_info 
-        where update_user='""" + update_user + "' and status = 1 and update_time >= '2022-01-17'"
+        where update_user='""" + update_user + "' and status = 1 and update_time >= '2022-01-23'"
 
     result_list = postgresql_util(sql, limit=1000000)
     right_cnt = 0
+    error_cnt = 0
     error_id_list = []
     right_id_list = []
     i = 0
@@ -24,7 +25,10 @@ def user_label_accuracy(update_user):
     for table in result_list:
         i += 1
         if i % 1000 == 0:
-            print("Loop", i, right_cnt, time.time()-start_time)
+            if right_cnt + error_cnt != 0:
+                print("Loop", i, right_cnt/(right_cnt+error_cnt), time.time()-start_time)
+            else:
+                print("Loop", i, time.time()-start_time)
             start_time = time.time()
 
         pre_label = eval(table[1])
@@ -49,27 +53,44 @@ def user_label_accuracy(update_user):
         else:
             label_list = pre_label
 
-        predict_label_list = predict(table_text)
+        predict_label_list = predict(table_text, model_id=3)
         if predict_label_list:
             if str(label_list) == str(predict_label_list):
-                right_cnt += 1
                 right_id_list.append(str(_id)+"\n")
+                # right_cnt += 1
             else:
-                # cnt = 0
-                # for j in range(len(label_list)):
-                #     row1 = label_list[j]
-                #     row2 = predict_label_list[j]
-                #     if str(row1) != str(row2):
-                #         cnt += 1
-                #     if cnt >= 2:
-                #         error_id_list.append(str(_id)+"\n")
-                #         break
                 error_id_list.append(str(_id)+"\n")
+                # error_cnt += 1
+            if len(label_list) == len(predict_label_list):
+                for j in range(len(label_list)):
+                    for k in range(len(label_list[j])):
+                        if table_text[j][k] == "":
+                            continue
+                        if label_list[j][k] == "1" or predict_label_list[j][k] == "1":
+                            if len(table_text[j][k]) >= 20:
+                                continue
+                            else:
+                                if label_list[j][k] == predict_label_list[j][k]:
+                                    right_cnt += 1
+                                else:
+                                    error_cnt += 1
+            else:
+                print("len(label_list) == len(predict_label_list)", _id,
+                      len(label_list), len(predict_label_list))
 
-    accuracy = right_cnt / len(result_list)
+    accuracy = right_cnt / (right_cnt + error_cnt)
     print(update_user + " accuracy:", accuracy, 'total:', len(result_list))
     print("error_id_list", len(error_id_list))
 
+    save_path = "check_user_result/accuracy.txt"
+    with open(save_path, 'a') as f:
+        f.write(update_user + " "
+                + "表头正确率-" + str(round(accuracy, 2)) + " "
+                + "文章数-" + str(len(result_list)) + " "
+                + "表格总数-" + str(right_cnt + error_cnt) + " "
+                + "表头正确数-" + str(right_cnt)
+                + "\n")
+
     save_path = "check_user_result/"+update_user+"_error.txt"
     with open(save_path, 'w') as f:
         f.writelines(error_id_list)
@@ -101,9 +122,33 @@ def get_single_result(_id):
 
 
 if __name__ == '__main__':
-    # users = ["test9", "test11", "test12", "test20", "test25", "test26", "test27"]
-    users = ["test20", "test27"]
+    # users = ["test9", "test11", "test12", "test25", "test26"]
+    # users = ["test9", "test11", ]
+    # users = ['test12', 'test25']
+    # users = ["test20", "test27"]
     # users = ['test']
+    users = [
+        "test1",
+        "test11",
+        "test12",
+        "test16",
+        "test17",
+        "test19",
+        "test20",
+        "test21",
+        "test22",
+        "test25",
+        "test26",
+        "test27",
+        "test29",
+        "test3",
+        "test7",
+        "test8",
+        "test9",
+    ]
+    users = ["test"]
+    users = ["test12", "test17", "test21", "test22", "test27", ]
+    users = ["test27"]
     acc_list = []
     for user in users:
         acc = user_label_accuracy(user)

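The reworked accuracy only scores cells where at least one side marks a header ("1"), skips empty cells and cells of 20 or more characters, and reports right / (right + error). A compact sketch of that rule:

    def cell_accuracy(table_text, label, pred):
        right = error = 0
        for i, row in enumerate(label):
            for j, y in enumerate(row):
                # skip empty cells and long cells (>= 20 chars), as above
                if table_text[i][j] == "" or len(table_text[i][j]) >= 20:
                    continue
                if y == "1" or pred[i][j] == "1":   # only header-relevant cells count
                    if y == pred[i][j]:
                        right += 1
                    else:
                        error += 1
        return right, error

    print(cell_accuracy([["名称", "表" * 20]],
                        [["1", "1"]], [["1", "0"]]))  # (1, 0): the long cell is skipped
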
BIN
BiddingKG/dl/table_head/checkpoints/best.hdf5


BIN
BiddingKG/dl/table_head/checkpoints/binary_loss/best.hdf5


BIN
BiddingKG/dl/table_head/checkpoints/focal_loss/best.hdf5


+ 7 - 1
BiddingKG/dl/table_head/loss.py

@@ -15,4 +15,10 @@ def focal_loss(gamma=2., alpha=.5):
                                * K.backend.log(K.backend.epsilon()+pt_1))\
                - K.backend.sum((1-alpha) * K.backend.pow(pt_0, gamma)
                                * K.backend.log(1. - pt_0 + K.backend.epsilon()))
-    return f_loss
+    return f_loss
+
+
+def union_loss(gamma=2., alpha=.5):
+    def _loss(y_true, y_pred):
+        # minimal completion of the stub: apply the focal loss closure to the batch
+        return focal_loss(gamma, alpha)(y_true, y_pred)
+    return _loss

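For reference, the focal loss used here down-weights well-classified cells by (1 - p_t)**gamma and balances positives and negatives with alpha. A NumPy sketch of the same computation (not the repo's Keras code):

    import numpy as np

    def focal_loss_np(y_true, y_pred, gamma=2.0, alpha=0.5, eps=1e-7):
        y_pred = np.clip(y_pred, eps, 1.0 - eps)
        pt_1 = np.where(y_true == 1, y_pred, 1.0)   # predicted p for positive cells
        pt_0 = np.where(y_true == 0, y_pred, 0.0)   # predicted p for negative cells
        pos = -np.sum(alpha * (1 - pt_1) ** gamma * np.log(pt_1))
        neg = -np.sum((1 - alpha) * pt_0 ** gamma * np.log(1 - pt_0))
        return pos + neg

    print(focal_loss_np(np.array([1, 0, 1]), np.array([0.9, 0.2, 0.4])))
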
+ 237 - 7
BiddingKG/dl/table_head/models/model.py

@@ -1,16 +1,21 @@
 import sys
 import os
+import numpy as np
+from keras.layers import Lambda, Dense, Reshape, Bidirectional, LSTM, Conv2D, BatchNormalization, LeakyReLU, Masking
+from keras_preprocessing.sequence import pad_sequences
+sys.path.append(os.path.dirname(__file__))
 
-from keras.layers import Lambda
-
-sys.path.append(os.path.abspath("../.."))
-from keras import layers, models
+from models.layer_utils import BatchReshape1, BatchReshape2, MyPadding, MySplit, BatchReshape3, \
+    BatchReshape4, BatchReshape5, BatchReshape6
+from keras import layers, models, Sequential
 import keras.backend as K
-from BiddingKG.dl.table_head.models.my_average_pooling import MyAveragePooling1D
-from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+import tensorflow as tf
+from models.my_average_pooling import MyAveragePooling1D
+from models.self_attention import SeqSelfAttention, MySelfAttention
+from models.u_net import u_net_small
 
 
-def get_model(input_shape, output_shape):
+def model_1(input_shape, output_shape):
     # Input (batch, 10, 60)
     input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
     input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
@@ -67,3 +72,228 @@ def get_model(input_shape, output_shape):
 
     model.summary()
     return model
+
+
+def model_2(input_shape, output_shape):
+    # input_shape = (None, None, 10, 60)
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    hidden_size = 64
+    attention_size = 64
+    character_num = 10
+    character_embed = 60
+    cell_embed = 1
+
+    # Input
+    input_1 = layers.Input(shape=input_shape, dtype="float32", name="input_1")
+    input_2 = layers.Input(shape=(None, None, None, None), dtype="int32", name="input_2")
+    # batch = tf.shape(_input)[0]
+    height = tf.shape(input_2)[1]
+    width = tf.shape(input_2)[2]
+    pad_height = tf.shape(input_2)[3]
+    pad_width = tf.shape(input_2)[4]
+
+    # print("batch, height, width", batch, height, width)
+
+    # Reshape
+    reshape = BatchReshape1(character_num, character_embed)(input_1)
+    print("model_2_0", reshape)
+
+    # Bi-LSTM + Attention
+    bi_lstm = Bidirectional(LSTM(hidden_size))(reshape)
+    print("model_2_1", bi_lstm)
+    # bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape)
+    # self_attention = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm)
+    # trans = Lambda(lambda x: tf.transpose(x, (0, 2, 1)))(self_attention)
+    # dense = Dense(1, activation='relu')(trans)
+    # squeeze = Lambda(lambda x: tf.squeeze(x, -1))(dense)
+
+    dense = Dense(1, activation="sigmoid")(bi_lstm)
+    print("model_2_2", dense)
+    # reshape = Lambda(batch_reshape, output_shape=(height, width, cell_embed))(dense)
+    reshape = BatchReshape2(cell_embed)([input_1, dense])
+    print("model_2_3", reshape)
+    # squeeze_1 = Lambda(lambda x: K.squeeze(x, axis=-1), name="output_1")(reshape)
+    # print("model_2_4", squeeze)
+
+    # Padding
+    padding = MyPadding(pad_height, pad_width, cell_embed)(reshape)
+    # padding = reshape
+    print("model_2_4", padding)
+
+    # U-Net
+    # u_net = u_net_small(padding)
+    # print("model_2_5", u_net)
+
+    # Conv 5*5
+    conv = Conv2D(1, (5, 5), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (5, 5), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (5, 5), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_1 = LeakyReLU(alpha=0.)(bn)
+
+    # Conv 3*3
+    conv = Conv2D(1, (3, 3), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (3, 3), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (3, 3), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_2 = LeakyReLU(alpha=0.)(bn)
+
+    # Conv 1*1
+    conv = Conv2D(1, (1, 1), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (1, 1), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (1, 1), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_3 = LeakyReLU(alpha=0.)(bn)
+
+    # conv = Conv2D(cell_embed, (3, 3), padding='same')(relu)
+    # bn = BatchNormalization()(conv)
+    # relu_2 = LeakyReLU(alpha=0.)(bn)
+
+    # Merge
+    # print("model_2_5", relu_1, relu_2)
+    merge = layers.Concatenate(axis=-1)([relu_1, relu_2, relu_3])
+    # merge = u_net
+    # merge = relu
+    dense = layers.Dense(1, activation='sigmoid')(merge)
+    squeeze_2 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+
+    # Split
+    split = MySplit(height, width, name="output")(squeeze_2)
+
+    model = models.Model(inputs=[input_1, input_2], outputs=split)
+    model.summary(line_length=120)
+    return model
+
+
+def model_3(input_shape, output_shape):
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+
+    hidden_size = 16
+    attention_size = 2*hidden_size
+    character_num = 20
+    character_embed = 60
+    cell_embed = 2*hidden_size
+    pad_len = 100
+    mask_timestamps = pad_len
+
+    # Input
+    input_1 = layers.Input(shape=input_shape, dtype="float32", name="input_1")
+    input_2 = layers.Input(shape=(None, None, None, None), dtype="int32", name="input_2")
+
+    # Reshape
+    reshape = BatchReshape1(character_num, character_embed)(input_1)
+    print("model_2_0", reshape)
+
+    # Bi-LSTM
+    bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape)
+    bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=False))(bi_lstm)
+    print("model_2_1", bi_lstm)
+
+    # Reshape
+    reshape = BatchReshape2(cell_embed)([input_1, bi_lstm])
+    print("model_2_3", reshape)
+
+    # Rows Reshape
+    reshape_1 = BatchReshape3(cell_embed)(reshape)
+
+    # Cols Reshape
+    trans = Lambda(lambda x: tf.transpose(x, (0, 2, 1, 3)))(reshape)
+    reshape_2 = BatchReshape3(cell_embed)(trans)
+
+    # All boxes Reshape
+    reshape_3 = BatchReshape5(cell_embed)(reshape)
+
+    # Masking
+    # mask_1 = Masking(mask_value=-1, input_shape=(mask_timestamps, cell_embed))(pad_1)
+    # mask_2 = Masking(mask_value=-1, input_shape=(mask_timestamps, cell_embed))(pad_2)
+    # print("model_2_4", mask_1)
+
+    # Padding
+    # pad_1 = MyPadding()
+
+    # Bi-LSTM
+    # bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))
+    # bi_lstm_1 = bi_lstm(reshape_1)
+    # bi_lstm_2 = bi_lstm(reshape_2)
+    bi_lstm_1 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_1)
+    bi_lstm_2 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_2)
+    # bi_lstm_1 = LSTM(2*hidden_size, return_sequences=True)(reshape_1)
+    # print("model_2_4", bi_lstm_1)
+    # bi_lstm_2 = LSTM(2*hidden_size, return_sequences=True)(reshape_2)
+    # self_attention_1 = MySelfAttention(output_dim=attention_size)(bi_lstm_1)
+    # self_attention_2 = MySelfAttention(output_dim=attention_size)(bi_lstm_2)
+
+    # Bi-LSTM + Attention
+    bi_lstm_3 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_3)
+    # bi_lstm_3 = LSTM(2*hidden_size, return_sequences=True)(reshape_3)
+    # self_attention_3 = MySelfAttention(output_dim=attention_size)(bi_lstm_3)
+    # print("model_2_5", bi_lstm_1)
+
+    # Reshape
+    reshape_1 = BatchReshape4(cell_embed)([reshape, bi_lstm_1])
+    reshape_2 = BatchReshape4(cell_embed)([trans, bi_lstm_2])
+    reshape_2 = Lambda(lambda x: tf.transpose(x, (0, 2, 1, 3)))(reshape_2)
+    reshape_3 = BatchReshape6(cell_embed)([reshape, bi_lstm_3])
+    print("model_2_6", reshape_1)
+
+    # Merge
+    merge = layers.Concatenate(axis=-1)([reshape, reshape_1, reshape_2, reshape_3])
+    dense = layers.Dense(hidden_size, activation='relu')(merge)
+    dense = layers.Dense(1, activation='sigmoid')(dense)
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1), name="output")(dense)
+
+    model = models.Model(inputs=[input_1, input_2], outputs=squeeze)
+    model.summary(line_length=110)
+    return model
+
+
+def get_model(input_shape, output_shape, model_id):
+    if model_id == 1:
+        return model_1(input_shape, output_shape)
+    elif model_id == 2:
+        return model_2(input_shape, output_shape)
+    elif model_id == 3:
+        return model_3(input_shape, output_shape)
+    else:
+        print("No such model!")
+        raise Exception()
+
+
+def test_layer():
+    model = Sequential()
+    model.add(Masking(mask_value=-1, input_shape=(5, 8)))
+    model.add(Lambda(lambda x: pad_sequences(x, maxlen=100, dtype='float32',
+                                             padding='post', truncating='post',
+                                             value=-1)))
+    model.add(Masking(mask_value=-1, input_shape=(5, 8)))
+    model.add(LSTM(32, return_sequences=True))
+
+    model.compile(optimizer='sgd', loss='mse')
+
+    x = np.zeros([1, 5, 8])
+    print(x.shape)
+    y = np.zeros([1, 5, 32])
+    model.summary()
+    model.fit(x, y, batch_size=32, epochs=10)
+
+
+if __name__ == "__main__":
+    test_layer()

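The core of the new model_2 is three convolution branches (5x5, 3x3, 1x1) over the padded cell-embedding grid, concatenated and squashed to one header probability per cell. A reduced sketch of that branch-and-merge pattern in plain Keras; the layer sizes and single-channel input are assumptions:

    from keras import layers, models

    def conv_branch(x, k):
        # three Conv -> BN -> LeakyReLU blocks with the same kernel size
        for _ in range(3):
            x = layers.Conv2D(1, (k, k), padding='same')(x)
            x = layers.BatchNormalization()(x)
            x = layers.LeakyReLU(alpha=0.)(x)
        return x

    inp = layers.Input(shape=(None, None, 1))        # (rows, cols, cell_embed)
    merged = layers.Concatenate(axis=-1)([conv_branch(inp, k) for k in (5, 3, 1)])
    out = layers.Dense(1, activation='sigmoid')(merged)  # header probability per cell
    models.Model(inp, out).summary()
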
+ 40 - 1
BiddingKG/dl/table_head/models/self_attention.py

@@ -1,5 +1,6 @@
 import keras
 from keras import backend as K
+from keras.layers import Layer
 
 
 class SeqSelfAttention(keras.layers.Layer):
@@ -237,4 +238,42 @@ class SeqSelfAttention(keras.layers.Layer):
 
     @staticmethod
     def get_custom_objects():
-        return {'SeqSelfAttention': SeqSelfAttention}
+        return {'SeqSelfAttention': SeqSelfAttention}
+
+
+class MySelfAttention(Layer):
+    def __init__(self, output_dim, **kwargs):
+        self.output_dim = output_dim
+        super(MySelfAttention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # inputs.shape = (batch_size, time_steps, seq_len)
+        self.W_Q = self.add_weight(name='W_Q',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+        self.W_K = self.add_weight(name='W_K',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+        self.W_V = self.add_weight(name='W_V',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+
+        super(MySelfAttention, self).build(input_shape)
+
+    def call(self, x, mask=None, **kwargs):
+        _Q = K.dot(x, self.W_Q)
+        _K = K.dot(x, self.W_K)
+        _V = K.dot(x, self.W_V)
+
+        # use batch_dot instead of K.T for the batched transpose product
+        _Z = K.batch_dot(_Q, K.permute_dimensions(_K, [0, 2, 1]))
+        _Z = _Z / (self.output_dim**0.5)
+        _Z = K.softmax(_Z)
+        _Z = K.batch_dot(_Z, _V)
+        return _Z
+
+    def compute_output_shape(self, input_shape):
+        return input_shape[0], input_shape[1], self.output_dim

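MySelfAttention.call implements plain scaled dot-product self-attention, Z = softmax(Q·K^T / sqrt(d))·V, with learned projections W_Q, W_K, W_V. A NumPy sketch of the same computation:

    import numpy as np

    def softmax(x, axis=-1):
        e = np.exp(x - x.max(axis=axis, keepdims=True))
        return e / e.sum(axis=axis, keepdims=True)

    def self_attention(x, w_q, w_k, w_v):
        q, k, v = x @ w_q, x @ w_k, x @ w_v                  # (batch, steps, d)
        scores = q @ k.transpose(0, 2, 1) / np.sqrt(w_q.shape[-1])
        return softmax(scores) @ v                           # (batch, steps, d)

    x = np.random.rand(2, 5, 8)                              # toy batch
    w_q, w_k, w_v = (np.random.rand(8, 4) for _ in range(3))
    print(self_attention(x, w_q, w_k, w_v).shape)            # (2, 5, 4)
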
+ 18 - 0
BiddingKG/dl/table_head/post_process.py

@@ -24,3 +24,21 @@ def table_post_process(table_text_list, predict_result, threshold=0.5):
         print("table_post_process 输出label维度与text不一致!")
         table_label_list = []
     return table_label_list
+
+
+def table_post_process_2(table_text_list, predict_result, threshold=0.5):
+    predict_result = predict_result.tolist()[0]
+    predict_list = []
+    for row in predict_result:
+        new_row = []
+        for col in row:
+            if col >= threshold:
+                new_row.append("1")
+            else:
+                new_row.append("0")
+        predict_list.append(new_row)
+
+    if len(predict_list) != len(predict_result):
+        print("table_post_process 输出label维度与text不一致!")
+        predict_list = []
+    return predict_list

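table_post_process_2 simply thresholds the model's per-cell sigmoid scores into "1"/"0" header flags. A minimal example of the same thresholding, with a made-up score array shaped like the model's batched output (assumption):

    import numpy as np

    scores = np.array([[[0.93, 0.88],     # one table in the batch, 2x2 cells
                        [0.12, 0.07]]])
    labels = [["1" if p >= 0.5 else "0" for p in row]
              for row in scores.tolist()[0]]
    print(labels)  # [['1', '1'], ['0', '0']]
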
+ 22 - 16
BiddingKG/dl/table_head/postgresql2csv.py

@@ -21,23 +21,29 @@ def eval_text_list(table_text):
 def read_postgresql(txt_name, start_id, _time):
     conn = psycopg2.connect(database="table_head_label", user="postgres",
                             password="postgres", host="192.168.2.103", port="5432")
-
-    with open('check_user_result/' + txt_name, "r") as f:
-        id_list = f.readlines()
-    # with open('check_user_result/test27.txt', "r") as f:
-    #     id_list += f.readlines()
-
-    _list = []
-    for _id in id_list:
-        _id = _id[:-1]
-        sql = 'select * from label_table_head_info where id =' + _id
+    row_list = []
+    if txt_name == "":
+        sql = """
+        select * from "label_table_head_info" 
+        where status = 1 and update_time >= '2022-01-17';
+        """
         df = pd.read_sql(sql=sql, con=conn)
-        # df = df[0]
         for index, row in df.iterrows():
-            _list.append([x for x in row])
-    cnt = 0
+            row_list.append([x for x in row])
+    else:
+        with open('check_user_result/' + txt_name, "r") as f:
+            id_list = f.readlines()
+        for _id in id_list:
+            _id = _id[:-1]
+            sql = 'select * from label_table_head_info where id =' + _id
+            df = pd.read_sql(sql=sql, con=conn)
+            # df = df[0]
+            for index, row in df.iterrows():
+                row_list.append([x for x in row])
+        cnt = 0
+
     new_list = []
-    for line in _list:
+    for line in row_list:
         try:
             table_text = eval_text_list(line[2])
         except:
@@ -57,7 +63,7 @@ def read_postgresql(txt_name, start_id, _time):
         label_list = predict(table_text)
         line[3] = str(label_list)
         new_list.append(line)
-    df = pd.DataFrame(_list)
+    df = pd.DataFrame(new_list)
     new_csv_path = "data_new.csv"
 
     df.to_csv(new_csv_path, index=False)
@@ -66,7 +72,7 @@ def read_postgresql(txt_name, start_id, _time):
 
 
 if __name__ == '__main__':
-    new_csv_path = read_postgresql('test20_error.txt', 203995, '2022-01-01 00:00:00')
+    new_csv_path = read_postgresql('test11_error.txt', 206863, '2021-12-31 00:00:00')
     # new_csv_path = read_postgresql('test20_right.txt', 203995, '')
     # df = pd.read_csv('data_new.csv')
     # print(df.iloc[:, 4])

+ 241 - 29
BiddingKG/dl/table_head/pre_process.py

@@ -1,8 +1,10 @@
+import os
 import random
-
+import sys
 import psycopg2
 import numpy as np
-from BiddingKG.dl.common.Utils import embedding_word
+sys.path.append(os.path.dirname(__file__) + "/../")
+from common.Utils import embedding_word, embedding_word_forward
 
 
 def get_sentence_index_list(sentence, dict_path='utils/ppocr_keys_v1.txt'):
@@ -42,22 +44,28 @@ def postgresql_util(sql, limit):
     return all_rows
 
 
-def get_data_from_sql(dim=10):
+def get_data_from_sql(dim=10, whole_table=False, padding=True):
+    sql = """
+    select table_text, pre_label, post_label, id
+    from label_table_head_info
+    where status = 0 and (update_user='test9' or update_user='test1' or update_user='test7' or update_user='test26')
+    ;
+    """
     # sql = """
     # select table_text, pre_label, post_label, id
     # from label_table_head_info
-    # where update_user <> 'test27' and update_user <> 'test20' and table_box_cnt >= 4 and table_box_cnt <= 200
+    # where status = 1 and update_time >= '2022-01-17' and update_time <= '2022-01-22'
     # ;
     # """
-    sql = """
-    select table_text, pre_label, post_label, id
-    from label_table_head_info 
-    where status = 1 and update_time >= '2022-01-17'
-    ;
-    """
 
     result_list = postgresql_util(sql, limit=1000000)
 
+    # IDs to exclude
+    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+        delete_id_list = eval(f.read())
+    with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "r") as f:
+        delete_id_list += eval(f.read())
+
     all_data_list = []
     all_data_label_list = []
     i = 0
@@ -71,6 +79,10 @@ def get_data_from_sql(dim=10):
         post_label = eval(table[2])
         _id = table[3]
 
+        if _id in delete_id_list:
+            print("pass", _id)
+            continue
+
         # table_text needs special handling
         try:
             table_text = table[0]
@@ -84,17 +96,35 @@ def get_data_from_sql(dim=10):
             print("无法识别table_text", _id)
             continue
 
-        # skip tables with only one row
-        if len(post_label) >= 2:
-            data_list, data_label_list = table_pre_process(table_text, post_label, _id)
-        elif len(pre_label) >= 2:
-            data_list, data_label_list = table_pre_process(table_text, pre_label, _id)
+        if whole_table:
+            if len(post_label) >= 2:
+                data_list, data_label_list = table_pre_process_2(table_text, post_label,
+                                                                 _id, padding=padding)
+            elif len(pre_label) >= 2:
+                data_list, data_label_list = table_pre_process_2(table_text, pre_label,
+                                                                 _id, padding=padding)
+            else:
+                data_list, data_label_list = [], []
         else:
-            data_list, data_label_list = [], []
+            # skip tables with only one row
+            if len(post_label) >= 2:
+                data_list, data_label_list = table_pre_process(table_text, post_label, _id)
+            elif len(pre_label) >= 2:
+                data_list, data_label_list = table_pre_process(table_text, pre_label, _id)
+            else:
+                data_list, data_label_list = [], []
 
         all_data_list += data_list
         all_data_label_list += data_label_list
 
+    # sort by table dimensions (row count, then column count)
+    if whole_table:
+        _list = []
+        for data, label in zip(all_data_list, all_data_label_list):
+            _list.append([data, label])
+        _list.sort(key=lambda x: (len(x[0]), len(x[0][0])))
+        all_data_list[:], all_data_label_list[:] = zip(*_list)
+
     print("len(all_data_list)", len(all_data_list))
     return all_data_list, all_data_label_list
 
@@ -206,7 +236,84 @@ def table_pre_process(text_list, label_list, _id, is_train=True):
         return data_list
 
 
-def get_data_from_file(file_type):
+def table_pre_process_2(text_list, label_list, _id, is_train=True, padding=True):
+    """
+    Process the whole table as a single array and pad its height/width dimensions
+
+    :param text_list:
+    :param label_list:
+    :param _id:
+    :param is_train:
+    :return:
+    """
+    # check whether the table height and width are reasonable
+    row_len = len(text_list)
+    best_row_len = get_best_padding_size(row_len, min_len=8)
+    col_len = len(text_list[0])
+    best_col_len = get_best_padding_size(col_len, min_len=8)
+    if best_row_len is None:
+        if is_train:
+            return [], []
+        else:
+            return []
+    if best_col_len is None:
+        if is_train:
+            return [], []
+        else:
+            return []
+
+    if is_train:
+        if len(text_list) != len(label_list):
+            print("文字单元格与标注单元格数量不匹配!", _id)
+            print("len(text_list)", len(text_list), "len(label_list)", len(label_list))
+            return [], []
+
+        if padding:
+            for i in range(row_len):
+                col_len = len(text_list[i])
+                text_list[i] += [None]*(best_col_len-col_len)
+                if is_train:
+                    label_list[i] += ["0"]*(best_col_len-col_len)
+            text_list += [[None]*best_col_len]*(best_row_len-row_len)
+            if is_train:
+                label_list += [["0"]*best_col_len]*(best_row_len-row_len)
+
+    if is_train:
+        for i in range(len(label_list)):
+            for j in range(len(label_list[i])):
+                label_list[i][j] = int(label_list[i][j])
+        return [text_list], [label_list]
+    else:
+        return [text_list]
+
+
+def get_best_padding_size(axis_len, min_len=3, max_len=300):
+    # sizes = [8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+    #          128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224,
+    #          232, 240, 248, 256, 264, 272, 280, 288, 296]
+    # sizes = [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57,
+    #          60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99, 102, 105, 108, 111,
+    #          114, 117, 120, 123, 126, 129, 132, 135, 138, 141, 144, 147, 150, 153, 156,
+    #          159, 162, 165, 168, 171, 174, 177, 180, 183, 186, 189, 192, 195, 198, 201,
+    #          204, 207, 210, 213, 216, 219, 222, 225, 228, 231, 234, 237, 240, 243, 246,
+    #          249, 252, 255, 258, 261, 264, 267, 270, 273, 276, 279, 282, 285, 288, 291,
+    #          294, 297]
+    sizes = []
+    for i in range(1, max_len):
+        if i * min_len <= max_len:
+            sizes.append(i * min_len)
+    if axis_len > sizes[-1]:
+        return axis_len
+    best_len = sizes[-1]
+    for height in sizes:
+        if axis_len <= height:
+            best_len = height
+            break
+    # print("get_best_padding_size", axis_len, best_len)
+    return best_len
+
+
+def get_data_from_file(file_type, model_id=1):
     if file_type == 'np':
         data_path = 'train_data/data_3.npy'
         data_label_path = 'train_data/data_label_3.npy'
@@ -215,17 +322,20 @@ def get_data_from_file(file_type):
         array2 = np.load(data_label_path)
         return array1, array2
     elif file_type == 'txt':
-        data_path = 'train_data/data3.txt'
-        data_label_path = 'train_data/data_label3.txt'
-
+        if model_id == 1:
+            data_path = 'train_data/data1.txt'
+            data_label_path = 'train_data/data_label1.txt'
+        elif model_id == 2:
+            data_path = 'train_data/data2.txt'
+            data_label_path = 'train_data/data_label2.txt'
+        elif model_id == 3:
+            data_path = 'train_data/data3.txt'
+            data_label_path = 'train_data/data_label3.txt'
         with open(data_path, 'r') as f:
             data_list = f.readlines()
         with open(data_label_path, 'r') as f:
             data_label_list = f.readlines()
 
-        # for i in range(len(data_list)):
-        #     data_list[i] = eval(data_list[i][:-1])
-        #     data_label_list[i] = eval(data_label_list[i][:-1])
         return data_list, data_label_list
     else:
         print("file type error! only np and txt supported")
@@ -245,18 +355,19 @@ def processed_save_to_np():
     #         f.write(str(line) + "\n")
 
 
-def processed_save_to_txt():
-    list1, list2 = get_data_from_sql()
+def processed_save_to_txt(whole_table=False, padding=True):
+    list1, list2 = get_data_from_sql(whole_table=whole_table, padding=padding)
 
     # shuffle
+    # if not whole_table or not padding:
     zip_list = list(zip(list1, list2))
     random.shuffle(zip_list)
     list1[:], list2[:] = zip(*zip_list)
 
-    with open('train_data/data3.txt', 'w') as f:
+    with open('train_data/data1.txt', 'w') as f:
         for line in list1:
             f.write(str(line) + "\n")
-    with open('train_data/data_label3.txt', 'w') as f:
+    with open('train_data/data_label1.txt', 'w') as f:
         for line in list2:
             f.write(str(line) + "\n")
 
@@ -287,7 +398,7 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
     data_num = len(data_list)
 
     # define the embedding output shape
-    output_shape = (6, 10, 60)
+    output_shape = (6, 20, 60)
 
     # fetch data batch by batch
     i = 0
@@ -349,8 +460,109 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
                    'input_4': X[3], 'input_5': X[4], 'input_6': X[5], }
 
 
+def my_data_loader_2(table_list, table_label_list, batch_size, is_train=True):
+    pad_len = 0
+
+    table_num = len(table_list)
+    if is_train and batch_size == 1:
+        table_list, table_label_list = get_random(table_list, table_label_list)
+
+    # Embedding shape
+    output_shape = (20, 60)
+
+    # fetch data batch by batch
+    i = 0
+    last_shape = None
+    while True:
+        new_table_list = []
+        new_table_label_list = []
+        for j in range(batch_size):
+            if i >= table_num:
+                i = 0
+                if is_train:
+                    table_list, table_label_list = get_random(table_list, table_label_list,
+                                                              seed=random.randint(1, 40))
+
+            if type(table_list[i]) != list:
+                table = eval(table_list[i][:-1])
+            else:
+                table = table_list[i]
+
+            if batch_size > 1:
+                if last_shape is None:
+                    last_shape = (len(table), len(table[0]))
+                    continue
+                if (len(table), len(table[0])) != last_shape:
+                    last_shape = (len(table), len(table[0]))
+                    break
+
+            if is_train:
+                table_label = eval(table_label_list[i][:-1])
+
+            # map Chinese characters to embeddings
+            for k in range(len(table)):
+                table[k] = embedding_word_forward(table[k], (len(table[k]),
+                                                     output_shape[0],
+                                                     output_shape[1]))
+            new_table_list.append(table)
+            if is_train:
+                new_table_label_list.append(table_label)
+            i += 1
+        new_table_list = np.array(new_table_list)
+        X = new_table_list
+        if X.shape[-2:] != output_shape:
+            # print("Dimension not match!", X.shape)
+            # print("\n")
+            continue
+
+        # compute the padding sizes
+        pad_height = get_best_padding_size(X.shape[1], pad_len)
+        pad_width = get_best_padding_size(X.shape[2], pad_len)
+        input_2 = np.zeros([1, X.shape[1], X.shape[2], pad_height, pad_width])
+
+        if is_train:
+            new_table_label_list = np.array(new_table_label_list)
+            Y = new_table_label_list
+            # Y = Y.astype(np.float32)
+            # yield {"input_1": X, "input_2": input_2}, \
+            #       {"output_1": Y, "output_2": Y}
+            yield {"input_1": X, "input_2": input_2}, \
+                  {"output": Y}
+        else:
+            yield {"input_1": X, "input_2": input_2}
+
+
+def check_train_data():
+    data_list, label_list = get_data_from_file('txt', model_id=2)
+    for data in data_list:
+        data = eval(data)
+        if len(data) % 8 != 0:
+            print(len(data))
+            print(len(data[0]))
+        for row in data:
+            if len(row) % 8 != 0:
+                print(len(data))
+                print(len(row))
+
+
+def get_random(text_list, label_list, seed=42):
+    random.seed(seed)
+    zip_list = list(zip(text_list, label_list))
+    random.shuffle(zip_list)
+    text_list[:], label_list[:] = zip(*zip_list)
+    return text_list, label_list
+
+
 if __name__ == '__main__':
-    processed_save_to_txt()
+    processed_save_to_txt(whole_table=False, padding=False)
     # data_balance()
 
     # test_embedding()
+    # check_train_data()
+
+    # _list = []
+    # for i in range(1, 100):
+    #     _list.append(i*3)
+    # print(_list)
+
+    # print(get_best_padding_size(9, 5))

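get_best_padding_size rounds a table dimension up to the next multiple of min_len (8 for the whole-table models) so padded tables share compatible shapes, and falls back to the raw length once it exceeds max_len. A standalone sketch of that rounding:

    def best_padding_size(axis_len, min_len=8, max_len=300):
        sizes = [i * min_len for i in range(1, max_len) if i * min_len <= max_len]
        if axis_len > sizes[-1]:
            return axis_len            # too large to pad, keep as-is
        for size in sizes:
            if axis_len <= size:
                return size            # first multiple that fits

    print(best_padding_size(9))    # 16
    print(best_padding_size(301))  # 301
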
File changes are not shown because they are too large
+ 121 - 82
BiddingKG/dl/table_head/predict.py


+ 57 - 33
BiddingKG/dl/table_head/train.py

@@ -2,24 +2,40 @@ import sys
 import os
 sys.path.append(os.path.abspath("../../.."))
 os.environ['KERAS_BACKEND'] = 'tensorflow'
-from keras.metrics import categorical_accuracy
+from BiddingKG.dl.table_head.models.layer_utils import MyModelCheckpoint
 from BiddingKG.dl.table_head.metrics import precision, recall, f1
 from keras import optimizers, Model
 from BiddingKG.dl.table_head.models.model import get_model
 from BiddingKG.dl.table_head.loss import focal_loss
 from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
-from BiddingKG.dl.table_head.pre_process import get_data_from_file, get_data_from_sql, my_data_loader
+from BiddingKG.dl.table_head.pre_process import get_data_from_file, get_data_from_sql, my_data_loader, my_data_loader_2, \
+    get_random
 from keras import backend as K
 
-
-input_shape = (6, 10, 60)
-output_shape = (1,)
-batch_size = 32
-epochs = 1000
-pretrained_path = "checkpoints/best.hdf5"
-checkpoint_path = "checkpoints/"
-PRETRAINED = True
-CHECKPOINT = False
+model_id = 1
+
+if model_id == 1:
+    input_shape = (6, 20, 60)
+    output_shape = (1,)
+    batch_size = 128
+    epochs = 1000
+    PRETRAINED = True
+    CHECKPOINT = False
+    # use the GPU
+    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+else:
+    input_shape = (None, None, 20, 60)
+    output_shape = (None, None)
+    batch_size = 1
+    epochs = 1000
+    PRETRAINED = False
+    CHECKPOINT = False
+    # use the CPU
+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+pretrained_path = "checkpoints/" + str(model_id) + "/best.hdf5"
+checkpoint_path = "checkpoints/" + str(model_id) + "/"
 
 
 def train():
@@ -27,22 +43,31 @@ def train():
     print("gpus", K.tensorflow_backend._get_available_gpus())
 
     # Data
-    data_x, data_y = get_data_from_file('txt')
-    # data_x = data_x[:60000]
-    # data_y = data_y[:60000]
+    data_x, data_y = get_data_from_file('txt', model_id=model_id)
     print("finish read data", len(data_x))
 
     # Split -> Train, Test
-    split_size = int(len(data_x)*0.1)
-    test_x, test_y = data_x[:split_size], data_y[:split_size]
-    train_x, train_y = data_x[split_size:], data_y[split_size:]
+    if model_id == 1:
+        split_size = int(len(data_x)*0.1)
+        test_x, test_y = data_x[:split_size], data_y[:split_size]
+        train_x, train_y = data_x[split_size:], data_y[split_size:]
+    else:
+        data_x, data_y = get_random(data_x, data_y)
+        split_size = int(len(data_x)*0.1)
+        test_x, test_y = data_x[:split_size], data_y[:split_size]
+        train_x, train_y = data_x[split_size:], data_y[split_size:]
+    print("len(train_x), len(test_x)", len(train_x), len(test_x))
 
     # Data Loader
-    train_data_loader = my_data_loader(train_x, train_y, batch_size=batch_size)
-    test_data_loader = my_data_loader(test_x, test_y, batch_size=batch_size)
+    if model_id == 1:
+        train_data_loader = my_data_loader(train_x, train_y, batch_size=batch_size)
+        test_data_loader = my_data_loader(test_x, test_y, batch_size=batch_size)
+    else:
+        train_data_loader = my_data_loader_2(train_x, train_y, batch_size=batch_size)
+        test_data_loader = my_data_loader_2(test_x, test_y, batch_size=1)
 
     # Model
-    model = get_model(input_shape, output_shape)
+    model = get_model(input_shape, output_shape, model_id=model_id)
     if PRETRAINED:
         model.load_weights(pretrained_path)
         print("read pretrained model", pretrained_path)
@@ -54,16 +79,20 @@ def train():
     else:
         print("no checkpoint")
 
-    filepath = 'e{epoch:02d}-f1{val_f1:.2f}'
-    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5", monitor='val_f1',
-                                 verbose=1, save_best_only=True, mode='max')
+    filepath = 'e-{epoch:02d}_f1-{val_f1:.2f}'
+    # filepath = 'e-{epoch:02d}_acc-{val_loss:.2f}'
+    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5",
+                                 monitor='val_f1',
+                                 verbose=1,
+                                 save_best_only=True,
+                                 mode='max')
 
-    model.compile(optimizer=optimizers.Adam(lr=0.005), loss=focal_loss(),
-    # model.compile(optimizer=optimizers.Adam(lr=0.005), loss='binary_crossentropy',
-                  metrics=['acc',
-                           precision, recall, f1])
+    model.compile(optimizer=optimizers.Adam(lr=0.0005),
+                  loss={"output": focal_loss(3., 0.5)},
+                  # loss_weights={"output": 0.5},
+                  metrics=['acc', precision, recall, f1])
 
-    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=5,
+    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=10,
                             verbose=1, mode='max', cooldown=0, min_lr=0)
 
     model.fit_generator(train_data_loader,
@@ -73,11 +102,6 @@ def train():
                         validation_steps=max(1, len(test_x) // batch_size),
                         epochs=epochs)
 
-    # model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y,
-    #           validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
-    #           epochs=epochs, batch_size=256, shuffle=True,
-    #           callbacks=[checkpoint, rlu])
-
     return model, test_x
 
 

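Training monitors a custom val_f1 metric for both checkpointing and learning-rate reduction. The repo's metric lives in BiddingKG.dl.table_head.metrics; the sketch below is only a common batch-wise formulation of such an F1 metric (an assumption, not the repo's code):

    import keras.backend as K

    def f1(y_true, y_pred):
        y_pred = K.round(K.clip(y_pred, 0, 1))          # threshold at 0.5
        tp = K.sum(y_true * y_pred)
        precision = tp / (K.sum(y_pred) + K.epsilon())
        recall = tp / (K.sum(y_true) + K.epsilon())
        return 2 * precision * recall / (precision + recall + K.epsilon())

    # used as: model.compile(..., metrics=['acc', precision, recall, f1])
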
+ 3 - 2
BiddingKG/dl/test/test4.py

@@ -46,7 +46,7 @@ def test(name,content):
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\test12354.txt","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
@@ -75,7 +75,8 @@ if __name__=="__main__":
     # '''
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
     # print(predict("12", content,"打印机"))
-    print(predict("12", text,"打印机"))
+    # content = codecs.open("D:\\Project\\format_conversion_maxcompute\\result.html", "r",encoding="utf8").read()
+    print(predict("12", content,"打印机"))
     # test(12,content)
     # test(12,text)
     print("takes",time.time()-_time1)

Some files are not shown because the diff is too large