Browse source

Update: use the model to identify table headers

fangjiasheng 3 years ago
parent
commit
7553b89e1c

+ 1 - 0
.idea/compiler.xml

@@ -1,6 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="CompilerConfiguration">
+    <option name="BUILD_PROCESS_HEAP_SIZE" value="11000" />
     <bytecodeTargetLevel>
       <module name="BiddingKG" target="8" />
     </bytecodeTargetLevel>

+ 8 - 0
.idea/inspectionProfiles/Project_Default.xml

@@ -2,5 +2,13 @@
   <profile version="1.0">
     <option name="myName" value="Project Default" />
     <inspection_tool class="DuplicatedCode" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyInterpreterInspection" enabled="false" level="WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredIdentifiers">
+        <list>
+          <option value="tensorflow.nn.bidirectional_dynamic_rnn" />
+        </list>
+      </option>
+    </inspection_tool>
   </profile>
 </component>

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_15" project-jdk-name="Python 3.5 (BiddingKG)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>

+ 2 - 2
BiddingKG.iml

@@ -2,13 +2,13 @@
 <module type="JAVA_MODULE" version="4">
   <component name="FacetManager">
     <facet type="Python" name="Python">
-      <configuration sdkName="Python 3.5 (dl_nlp)" />
+      <configuration sdkName="Python 3.5 (BiddingKG)" />
     </facet>
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
     <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
-    <orderEntry type="library" exported="" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
+    <orderEntry type="library" name="Python 3.5 (BiddingKG) interpreter library" level="application" />
   </component>
 </module>

+ 29 - 0
BiddingKG/dl/common/Utils.py

@@ -686,6 +686,35 @@ def embedding_word(datas,shape):
         out_index += 1
     return embed
 
+
+def embedding_word_forward(datas,shape):
+    '''
+    @summary: look up the character vectors for the given tokens
+    @param:
+        datas: list of tokens
+        shape: shape of the result
+    @return: array, the word embeddings in the given shape
+    '''
+    model_w2v = getModel_word()
+    embed = np.zeros(shape)
+    length = shape[1]
+    out_index = 0
+    #print(datas)
+    for data in datas:
+        index = 0
+        for item in str(data)[:shape[1]]:
+            if index>=length:
+                break
+            if item in model_w2v.vocab:
+                embed[out_index][index] = model_w2v[item]
+                index += 1
+            else:
+                # embed[out_index][index] = model_w2v['unk']
+                index += 1
+        out_index += 1
+    return embed
+
+
 def formEncoding(text,shape=(100,60),expand=False):
     embedding = np.zeros(shape)
     word_model = getModel_word()

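For context on the new embedding_word_forward: it maps each cell string to a left-aligned sequence of character vectors, truncates at shape[1] characters, and leaves out-of-vocabulary characters as zeros. A minimal standalone sketch of that lookup, with a plain dict standing in for the gensim model returned by getModel_word() (an assumption, so the snippet runs on its own):

    # Sketch only: toy_w2v replaces the repo's word2vec model (assumption).
    import numpy as np

    toy_w2v = {"项": np.ones(4), "目": np.full(4, 0.5)}   # 4-dim toy vectors

    def embed_cells(cells, max_chars=10, dim=4):
        out = np.zeros((len(cells), max_chars, dim))
        for row, text in enumerate(cells):
            for col, ch in enumerate(str(text)[:max_chars]):
                if ch in toy_w2v:               # known character -> its vector
                    out[row][col] = toy_w2v[ch]
                # unknown characters stay zero, like the commented-out 'unk' branch
        return out

    print(embed_cells(["项目名称", "金额"]).shape)  # (2, 10, 4)
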
+ 25 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -8,6 +8,7 @@ import time
 import codecs
 
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
+from BiddingKG.dl.table_head.predict import predict
 
 sys.setrecursionlimit(1000000)
 sys.path.append(os.path.abspath("../.."))
@@ -414,6 +415,28 @@ def tableToText(soup):
         
         return inner_table,head_list
 
+    def set_head_model(inner_table):
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                inner_table[i][j] = inner_table[i][j][0]
+
+        # predict table headers with the model
+        predict_list = predict(inner_table)
+        with open(r"C:\Users\Administrator\Desktop\table_head_test.txt", "a") as f:
+            for i in range(len(predict_list)):
+                f.write(str(i) + " " + str(inner_table[i]) + "\n")
+                f.write(str(i) + " " + str(predict_list[i]) + "\n")
+            f.write("\n")
+
+        # print("table_list", inner_table)
+        # print("predict_list", predict_list)
+
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                inner_table[i][j] = [inner_table[i][j], int(predict_list[i][j])]
+        head_list = sliceTable(inner_table)
+        return inner_table, head_list
+
     def setHead_incontext(inner_table,pat_head,fix_value="~~",prob_min=0.5):
 
         data_x,data_position = getPredictor("form").getModel("context").encode(inner_table)
@@ -969,7 +992,8 @@ def tableToText(soup):
         if len(inner_table)>0 and len(inner_table[0])>0:
             #inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
             #inner_table,head_list = setHead_inline(inner_table)
-            inner_table,head_list = setHead_initem(inner_table,pat_head)
+            # inner_table, head_list = setHead_initem(inner_table,pat_head)
+            inner_table, head_list = set_head_model(inner_table)
             # inner_table,head_list = setHead_incontext(inner_table,pat_head)
             # print(inner_table)
             # for begin in range(len(head_list[:-1])):

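For context on the new set_head_model: each cell is first reduced to its text, the table-head model predicts a 0/1 flag per cell, and the cell is rebuilt as [text, flag] before sliceTable runs. A small sketch of that round trip, using a stub in place of BiddingKG.dl.table_head.predict.predict (assumption):

    # Stub predictor (assumption): marks the first row as the header.
    def stub_predict(table):
        return [["1"] * len(row) if i == 0 else ["0"] * len(row)
                for i, row in enumerate(table)]

    inner_table = [["名称", "金额"], ["设备A", "100"]]
    labels = stub_predict(inner_table)
    # rebuild cells as [text, is_head], as set_head_model does before sliceTable
    inner_table = [[[cell, int(labels[i][j])] for j, cell in enumerate(row)]
                   for i, row in enumerate(inner_table)]
    print(inner_table)  # [[['名称', 1], ['金额', 1]], [['设备A', 0], ['100', 0]]]
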
+ 62 - 17
BiddingKG/dl/table_head/check_user_label_accuracy.py

@@ -8,15 +8,16 @@ def user_label_accuracy(update_user):
         sql = """
         select table_text, pre_label, post_label, id
         from label_table_head_info 
-        where update_user='""" + update_user + "' order by id desc limit 3000"
+        where update_user='""" + update_user + "' order by update_time"
     else:
         sql = """
         select table_text, pre_label, post_label, id
         from label_table_head_info 
-        where update_user='""" + update_user + "' and status = 1 and update_time >= '2022-01-17'"
+        where update_user='""" + update_user + "' and status = 1 and update_time >= '2022-01-23'"
 
     result_list = postgresql_util(sql, limit=1000000)
     right_cnt = 0
+    error_cnt = 0
     error_id_list = []
     right_id_list = []
     i = 0
@@ -24,7 +25,10 @@ def user_label_accuracy(update_user):
     for table in result_list:
         i += 1
         if i % 1000 == 0:
-            print("Loop", i, right_cnt, time.time()-start_time)
+            if right_cnt + error_cnt != 0:
+                print("Loop", i, right_cnt/(right_cnt+error_cnt), time.time()-start_time)
+            else:
+                print("Loop", i, time.time()-start_time)
             start_time = time.time()
 
         pre_label = eval(table[1])
@@ -49,27 +53,44 @@ def user_label_accuracy(update_user):
         else:
             label_list = pre_label
 
-        predict_label_list = predict(table_text)
+        predict_label_list = predict(table_text, model_id=3)
         if predict_label_list:
             if str(label_list) == str(predict_label_list):
-                right_cnt += 1
                 right_id_list.append(str(_id)+"\n")
+                # right_cnt += 1
             else:
-                # cnt = 0
-                # for j in range(len(label_list)):
-                #     row1 = label_list[j]
-                #     row2 = predict_label_list[j]
-                #     if str(row1) != str(row2):
-                #         cnt += 1
-                #     if cnt >= 2:
-                #         error_id_list.append(str(_id)+"\n")
-                #         break
                 error_id_list.append(str(_id)+"\n")
+                # error_cnt += 1
+            if len(label_list) == len(predict_label_list):
+                for j in range(len(label_list)):
+                    for k in range(len(label_list[j])):
+                        if table_text[j][k] == "":
+                            continue
+                        if label_list[j][k] == "1" or predict_label_list[j][k] == "1":
+                            if len(table_text[j][k]) >= 20:
+                                continue
+                            else:
+                                if label_list[j][k] == predict_label_list[j][k]:
+                                    right_cnt += 1
+                                else:
+                                    error_cnt += 1
+            else:
+                print("len(label_list) == len(predict_label_list)", _id,
+                      len(label_list), len(predict_label_list))
 
-    accuracy = right_cnt / len(result_list)
+    accuracy = right_cnt / (right_cnt + error_cnt)
     print(update_user + " accuracy:", accuracy, 'total:', len(result_list))
     print("error_id_list", len(error_id_list))
 
+    save_path = "check_user_result/accuracy.txt"
+    with open(save_path, 'a') as f:
+        f.write(update_user + " "
+                + "表头正确率-" + str(round(accuracy, 2)) + " "
+                + "文章数-" + str(len(result_list)) + " "
+                + "表格总数-" + str(right_cnt + error_cnt) + " "
+                + "表头正确数-" + str(right_cnt)
+                + "\n")
+
     save_path = "check_user_result/"+update_user+"_error.txt"
     with open(save_path, 'w') as f:
         f.writelines(error_id_list)
@@ -101,9 +122,33 @@ def get_single_result(_id):
 
 
 if __name__ == '__main__':
-    # users = ["test9", "test11", "test12", "test20", "test25", "test26", "test27"]
-    users = ["test20", "test27"]
+    # users = ["test9", "test11", "test12", "test25", "test26"]
+    # users = ["test9", "test11", ]
+    # users = ['test12', 'test25']
+    # users = ["test20", "test27"]
     # users = ['test']
+    users = [
+        "test1",
+        "test11",
+        "test12",
+        "test16",
+        "test17",
+        "test19",
+        "test20",
+        "test21",
+        "test22",
+        "test25",
+        "test26",
+        "test27",
+        "test29",
+        "test3",
+        "test7",
+        "test8",
+        "test9",
+    ]
+    users = ["test"]
+    users = ["test12", "test17", "test21", "test22", "test27", ]
+    users = ["test27"]
     acc_list = []
     for user in users:
         acc = user_label_accuracy(user)

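The reworked accuracy only scores cells where at least one side marks a header ("1"), skips empty cells and cells of 20 or more characters, and reports right / (right + error). A compact sketch of that rule:

    def cell_accuracy(table_text, label, pred):
        right = error = 0
        for i, row in enumerate(label):
            for j, y in enumerate(row):
                # skip empty cells and long cells (>= 20 chars), as above
                if table_text[i][j] == "" or len(table_text[i][j]) >= 20:
                    continue
                if y == "1" or pred[i][j] == "1":   # only header-relevant cells count
                    if y == pred[i][j]:
                        right += 1
                    else:
                        error += 1
        return right, error

    print(cell_accuracy([["名称", "表" * 20]],
                        [["1", "1"]], [["1", "0"]]))  # (1, 0): the long cell is skipped
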
BIN
BiddingKG/dl/table_head/checkpoints/best.hdf5


BIN
BiddingKG/dl/table_head/checkpoints/binary_loss/best.hdf5


BIN
BiddingKG/dl/table_head/checkpoints/focal_loss/best.hdf5


+ 7 - 1
BiddingKG/dl/table_head/loss.py

@@ -15,4 +15,10 @@ def focal_loss(gamma=2., alpha=.5):
                                * K.backend.log(K.backend.epsilon()+pt_1))\
                - K.backend.sum((1-alpha) * K.backend.pow(pt_0, gamma)
                                * K.backend.log(1. - pt_0 + K.backend.epsilon()))
-    return f_loss
+    return f_loss
+
+
+def union_loss(gamma=2., alpha=.5):
+    def _loss(y_true, y_pred):
+        # minimal completion of the stub: apply the focal loss closure to the batch
+        return focal_loss(gamma, alpha)(y_true, y_pred)
+    return _loss

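For reference, the focal loss used here down-weights well-classified cells by (1 - p_t)**gamma and balances positives and negatives with alpha. A NumPy sketch of the same computation (not the repo's Keras code):

    import numpy as np

    def focal_loss_np(y_true, y_pred, gamma=2.0, alpha=0.5, eps=1e-7):
        y_pred = np.clip(y_pred, eps, 1.0 - eps)
        pt_1 = np.where(y_true == 1, y_pred, 1.0)   # predicted p for positive cells
        pt_0 = np.where(y_true == 0, y_pred, 0.0)   # predicted p for negative cells
        pos = -np.sum(alpha * (1 - pt_1) ** gamma * np.log(pt_1))
        neg = -np.sum((1 - alpha) * pt_0 ** gamma * np.log(1 - pt_0))
        return pos + neg

    print(focal_loss_np(np.array([1, 0, 1]), np.array([0.9, 0.2, 0.4])))
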
+ 237 - 7
BiddingKG/dl/table_head/models/model.py

@@ -1,16 +1,21 @@
 import sys
 import os
+import numpy as np
+from keras.layers import Lambda, Dense, Reshape, Bidirectional, LSTM, Conv2D, BatchNormalization, LeakyReLU, Masking
+from keras_preprocessing.sequence import pad_sequences
+sys.path.append(os.path.dirname(__file__))
 
-from keras.layers import Lambda
-
-sys.path.append(os.path.abspath("../.."))
-from keras import layers, models
+from models.layer_utils import BatchReshape1, BatchReshape2, MyPadding, MySplit, BatchReshape3, \
+    BatchReshape4, BatchReshape5, BatchReshape6
+from keras import layers, models, Sequential
 import keras.backend as K
-from BiddingKG.dl.table_head.models.my_average_pooling import MyAveragePooling1D
-from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+import tensorflow as tf
+from models.my_average_pooling import MyAveragePooling1D
+from models.self_attention import SeqSelfAttention, MySelfAttention
+from models.u_net import u_net_small
 
 
-def get_model(input_shape, output_shape):
+def model_1(input_shape, output_shape):
     # Input (batch, 10, 60)
     input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
     input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
@@ -67,3 +72,228 @@ def get_model(input_shape, output_shape):
 
     model.summary()
     return model
+
+
+def model_2(input_shape, output_shape):
+    # input_shape = (None, None, 10, 60)
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    hidden_size = 64
+    attention_size = 64
+    character_num = 10
+    character_embed = 60
+    cell_embed = 1
+
+    # Input
+    input_1 = layers.Input(shape=input_shape, dtype="float32", name="input_1")
+    input_2 = layers.Input(shape=(None, None, None, None), dtype="int32", name="input_2")
+    # batch = tf.shape(_input)[0]
+    height = tf.shape(input_2)[1]
+    width = tf.shape(input_2)[2]
+    pad_height = tf.shape(input_2)[3]
+    pad_width = tf.shape(input_2)[4]
+
+    # print("batch, height, width", batch, height, width)
+
+    # Reshape
+    reshape = BatchReshape1(character_num, character_embed)(input_1)
+    print("model_2_0", reshape)
+
+    # Bi-LSTM + Attention
+    bi_lstm = Bidirectional(LSTM(hidden_size))(reshape)
+    print("model_2_1", bi_lstm)
+    # bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape)
+    # self_attention = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm)
+    # trans = Lambda(lambda x: tf.transpose(x, (0, 2, 1)))(self_attention)
+    # dense = Dense(1, activation='relu')(trans)
+    # squeeze = Lambda(lambda x: tf.squeeze(x, -1))(dense)
+
+    dense = Dense(1, activation="sigmoid")(bi_lstm)
+    print("model_2_2", dense)
+    # reshape = Lambda(batch_reshape, output_shape=(height, width, cell_embed))(dense)
+    reshape = BatchReshape2(cell_embed)([input_1, dense])
+    print("model_2_3", reshape)
+    # squeeze_1 = Lambda(lambda x: K.squeeze(x, axis=-1), name="output_1")(reshape)
+    # print("model_2_4", squeeze)
+
+    # Padding
+    padding = MyPadding(pad_height, pad_width, cell_embed)(reshape)
+    # padding = reshape
+    print("model_2_4", padding)
+
+    # U-Net
+    # u_net = u_net_small(padding)
+    # print("model_2_5", u_net)
+
+    # Conv 5*5
+    conv = Conv2D(1, (5, 5), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (5, 5), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (5, 5), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_1 = LeakyReLU(alpha=0.)(bn)
+
+    # Conv 3*3
+    conv = Conv2D(1, (3, 3), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (3, 3), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (3, 3), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_2 = LeakyReLU(alpha=0.)(bn)
+
+    # Conv 1*1
+    conv = Conv2D(1, (1, 1), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (1, 1), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (1, 1), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_3 = LeakyReLU(alpha=0.)(bn)
+
+    # conv = Conv2D(cell_embed, (3, 3), padding='same')(relu)
+    # bn = BatchNormalization()(conv)
+    # relu_2 = LeakyReLU(alpha=0.)(bn)
+
+    # Merge
+    # print("model_2_5", relu_1, relu_2)
+    merge = layers.Concatenate(axis=-1)([relu_1, relu_2, relu_3])
+    # merge = u_net
+    # merge = relu
+    dense = layers.Dense(1, activation='sigmoid')(merge)
+    squeeze_2 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+
+    # Split
+    split = MySplit(height, width, name="output")(squeeze_2)
+
+    model = models.Model(inputs=[input_1, input_2], outputs=split)
+    model.summary(line_length=120)
+    return model
+
+
+def model_3(input_shape, output_shape):
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+
+    hidden_size = 16
+    attention_size = 2*hidden_size
+    character_num = 20
+    character_embed = 60
+    cell_embed = 2*hidden_size
+    pad_len = 100
+    mask_timestamps = pad_len
+
+    # Input
+    input_1 = layers.Input(shape=input_shape, dtype="float32", name="input_1")
+    input_2 = layers.Input(shape=(None, None, None, None), dtype="int32", name="input_2")
+
+    # Reshape
+    reshape = BatchReshape1(character_num, character_embed)(input_1)
+    print("model_2_0", reshape)
+
+    # Bi-LSTM
+    bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape)
+    bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=False))(bi_lstm)
+    print("model_2_1", bi_lstm)
+
+    # Reshape
+    reshape = BatchReshape2(cell_embed)([input_1, bi_lstm])
+    print("model_2_3", reshape)
+
+    # Rows Reshape
+    reshape_1 = BatchReshape3(cell_embed)(reshape)
+
+    # Cols Reshape
+    trans = Lambda(lambda x: tf.transpose(x, (0, 2, 1, 3)))(reshape)
+    reshape_2 = BatchReshape3(cell_embed)(trans)
+
+    # All boxes Reshape
+    reshape_3 = BatchReshape5(cell_embed)(reshape)
+
+    # Masking
+    # mask_1 = Masking(mask_value=-1, input_shape=(mask_timestamps, cell_embed))(pad_1)
+    # mask_2 = Masking(mask_value=-1, input_shape=(mask_timestamps, cell_embed))(pad_2)
+    # print("model_2_4", mask_1)
+
+    # Padding
+    # pad_1 = MyPadding()
+
+    # Bi-LSTM
+    # bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))
+    # bi_lstm_1 = bi_lstm(reshape_1)
+    # bi_lstm_2 = bi_lstm(reshape_2)
+    bi_lstm_1 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_1)
+    bi_lstm_2 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_2)
+    # bi_lstm_1 = LSTM(2*hidden_size, return_sequences=True)(reshape_1)
+    # print("model_2_4", bi_lstm_1)
+    # bi_lstm_2 = LSTM(2*hidden_size, return_sequences=True)(reshape_2)
+    # self_attention_1 = MySelfAttention(output_dim=attention_size)(bi_lstm_1)
+    # self_attention_2 = MySelfAttention(output_dim=attention_size)(bi_lstm_2)
+
+    # Bi-LSTM + Attention
+    bi_lstm_3 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_3)
+    # bi_lstm_3 = LSTM(2*hidden_size, return_sequences=True)(reshape_3)
+    # self_attention_3 = MySelfAttention(output_dim=attention_size)(bi_lstm_3)
+    # print("model_2_5", bi_lstm_1)
+
+    # Reshape
+    reshape_1 = BatchReshape4(cell_embed)([reshape, bi_lstm_1])
+    reshape_2 = BatchReshape4(cell_embed)([trans, bi_lstm_2])
+    reshape_2 = Lambda(lambda x: tf.transpose(x, (0, 2, 1, 3)))(reshape_2)
+    reshape_3 = BatchReshape6(cell_embed)([reshape, bi_lstm_3])
+    print("model_2_6", reshape_1)
+
+    # Merge
+    merge = layers.Concatenate(axis=-1)([reshape, reshape_1, reshape_2, reshape_3])
+    dense = layers.Dense(hidden_size, activation='relu')(merge)
+    dense = layers.Dense(1, activation='sigmoid')(dense)
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1), name="output")(dense)
+
+    model = models.Model(inputs=[input_1, input_2], outputs=squeeze)
+    model.summary(line_length=110)
+    return model
+
+
+def get_model(input_shape, output_shape, model_id):
+    if model_id == 1:
+        return model_1(input_shape, output_shape)
+    elif model_id == 2:
+        return model_2(input_shape, output_shape)
+    elif model_id == 3:
+        return model_3(input_shape, output_shape)
+    else:
+        print("No such model!")
+        raise Exception()
+
+
+def test_layer():
+    model = Sequential()
+    model.add(Masking(mask_value=-1, input_shape=(5, 8)))
+    model.add(Lambda(lambda x: pad_sequences(x, maxlen=100, dtype='float32',
+                                             padding='post', truncating='post',
+                                             value=-1)))
+    model.add(Masking(mask_value=-1, input_shape=(5, 8)))
+    model.add(LSTM(32, return_sequences=True))
+
+    model.compile(optimizer='sgd', loss='mse')
+
+    x = np.zeros([1, 5, 8])
+    print(x.shape)
+    y = np.zeros([1, 5, 32])
+    model.summary()
+    model.fit(x, y, batch_size=32, epochs=10)
+
+
+if __name__ == "__main__":
+    test_layer()

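The core of the new model_2 is three convolution branches (5x5, 3x3, 1x1) over the padded cell-embedding grid, concatenated and squashed to one header probability per cell. A reduced sketch of that branch-and-merge pattern in plain Keras; the layer sizes and single-channel input are assumptions:

    from keras import layers, models

    def conv_branch(x, k):
        # three Conv -> BN -> LeakyReLU blocks with the same kernel size
        for _ in range(3):
            x = layers.Conv2D(1, (k, k), padding='same')(x)
            x = layers.BatchNormalization()(x)
            x = layers.LeakyReLU(alpha=0.)(x)
        return x

    inp = layers.Input(shape=(None, None, 1))        # (rows, cols, cell_embed)
    merged = layers.Concatenate(axis=-1)([conv_branch(inp, k) for k in (5, 3, 1)])
    out = layers.Dense(1, activation='sigmoid')(merged)  # header probability per cell
    models.Model(inp, out).summary()
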
+ 40 - 1
BiddingKG/dl/table_head/models/self_attention.py

@@ -1,5 +1,6 @@
 import keras
 from keras import backend as K
+from keras.layers import Layer
 
 
 class SeqSelfAttention(keras.layers.Layer):
@@ -237,4 +238,42 @@ class SeqSelfAttention(keras.layers.Layer):
 
     @staticmethod
     def get_custom_objects():
-        return {'SeqSelfAttention': SeqSelfAttention}
+        return {'SeqSelfAttention': SeqSelfAttention}
+
+
+class MySelfAttention(Layer):
+    def __init__(self, output_dim, **kwargs):
+        self.output_dim = output_dim
+        super(MySelfAttention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # inputs.shape = (batch_size, time_steps, seq_len)
+        self.W_Q = self.add_weight(name='W_Q',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+        self.W_K = self.add_weight(name='W_K',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+        self.W_V = self.add_weight(name='W_V',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+
+        super(MySelfAttention, self).build(input_shape)
+
+    def call(self, x, mask=None, **kwargs):
+        _Q = K.dot(x, self.W_Q)
+        _K = K.dot(x, self.W_K)
+        _V = K.dot(x, self.W_V)
+
+        # use batch_dot instead of K.T for the batched transpose product
+        _Z = K.batch_dot(_Q, K.permute_dimensions(_K, [0, 2, 1]))
+        _Z = _Z / (self.output_dim**0.5)
+        _Z = K.softmax(_Z)
+        _Z = K.batch_dot(_Z, _V)
+        return _Z
+
+    def compute_output_shape(self, input_shape):
+        return input_shape[0], input_shape[1], self.output_dim

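MySelfAttention.call implements plain scaled dot-product self-attention, Z = softmax(Q·K^T / sqrt(d))·V, with learned projections W_Q, W_K, W_V. A NumPy sketch of the same computation:

    import numpy as np

    def softmax(x, axis=-1):
        e = np.exp(x - x.max(axis=axis, keepdims=True))
        return e / e.sum(axis=axis, keepdims=True)

    def self_attention(x, w_q, w_k, w_v):
        q, k, v = x @ w_q, x @ w_k, x @ w_v                  # (batch, steps, d)
        scores = q @ k.transpose(0, 2, 1) / np.sqrt(w_q.shape[-1])
        return softmax(scores) @ v                           # (batch, steps, d)

    x = np.random.rand(2, 5, 8)                              # toy batch
    w_q, w_k, w_v = (np.random.rand(8, 4) for _ in range(3))
    print(self_attention(x, w_q, w_k, w_v).shape)            # (2, 5, 4)
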
+ 18 - 0
BiddingKG/dl/table_head/post_process.py

@@ -24,3 +24,21 @@ def table_post_process(table_text_list, predict_result, threshold=0.5):
         print("table_post_process 输出label维度与text不一致!")
         table_label_list = []
     return table_label_list
+
+
+def table_post_process_2(table_text_list, predict_result, threshold=0.5):
+    predict_result = predict_result.tolist()[0]
+    predict_list = []
+    for row in predict_result:
+        new_row = []
+        for col in row:
+            if col >= threshold:
+                new_row.append("1")
+            else:
+                new_row.append("0")
+        predict_list.append(new_row)
+
+    if len(predict_list) != len(predict_result):
+        print("table_post_process 输出label维度与text不一致!")
+        predict_list = []
+    return predict_list

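table_post_process_2 simply thresholds the model's per-cell sigmoid scores into "1"/"0" header flags. A minimal example of the same thresholding, with a made-up score array shaped like the model's batched output (assumption):

    import numpy as np

    scores = np.array([[[0.93, 0.88],     # one table in the batch, 2x2 cells
                        [0.12, 0.07]]])
    labels = [["1" if p >= 0.5 else "0" for p in row]
              for row in scores.tolist()[0]]
    print(labels)  # [['1', '1'], ['0', '0']]
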
+ 22 - 16
BiddingKG/dl/table_head/postgresql2csv.py

@@ -21,23 +21,29 @@ def eval_text_list(table_text):
 def read_postgresql(txt_name, start_id, _time):
     conn = psycopg2.connect(database="table_head_label", user="postgres",
                             password="postgres", host="192.168.2.103", port="5432")
-
-    with open('check_user_result/' + txt_name, "r") as f:
-        id_list = f.readlines()
-    # with open('check_user_result/test27.txt', "r") as f:
-    #     id_list += f.readlines()
-
-    _list = []
-    for _id in id_list:
-        _id = _id[:-1]
-        sql = 'select * from label_table_head_info where id =' + _id
+    row_list = []
+    if txt_name == "":
+        sql = """
+        select * from "label_table_head_info" 
+        where status = 1 and update_time >= '2022-01-17';
+        """
         df = pd.read_sql(sql=sql, con=conn)
-        # df = df[0]
         for index, row in df.iterrows():
-            _list.append([x for x in row])
-    cnt = 0
+            row_list.append([x for x in row])
+    else:
+        with open('check_user_result/' + txt_name, "r") as f:
+            id_list = f.readlines()
+        for _id in id_list:
+            _id = _id[:-1]
+            sql = 'select * from label_table_head_info where id =' + _id
+            df = pd.read_sql(sql=sql, con=conn)
+            # df = df[0]
+            for index, row in df.iterrows():
+                row_list.append([x for x in row])
+        cnt = 0
+
     new_list = []
-    for line in _list:
+    for line in row_list:
         try:
             table_text = eval_text_list(line[2])
         except:
@@ -57,7 +63,7 @@ def read_postgresql(txt_name, start_id, _time):
         label_list = predict(table_text)
         line[3] = str(label_list)
         new_list.append(line)
-    df = pd.DataFrame(_list)
+    df = pd.DataFrame(new_list)
     new_csv_path = "data_new.csv"
 
     df.to_csv(new_csv_path, index=False)
@@ -66,7 +72,7 @@ def read_postgresql(txt_name, start_id, _time):
 
 
 if __name__ == '__main__':
-    new_csv_path = read_postgresql('test20_error.txt', 203995, '2022-01-01 00:00:00')
+    new_csv_path = read_postgresql('test11_error.txt', 206863, '2021-12-31 00:00:00')
     # new_csv_path = read_postgresql('test20_right.txt', 203995, '')
     # df = pd.read_csv('data_new.csv')
     # print(df.iloc[:, 4])

+ 241 - 29
BiddingKG/dl/table_head/pre_process.py

@@ -1,8 +1,10 @@
+import os
 import random
-
+import sys
 import psycopg2
 import numpy as np
-from BiddingKG.dl.common.Utils import embedding_word
+sys.path.append(os.path.dirname(__file__) + "/../")
+from common.Utils import embedding_word, embedding_word_forward
 
 
 def get_sentence_index_list(sentence, dict_path='utils/ppocr_keys_v1.txt'):
@@ -42,22 +44,28 @@ def postgresql_util(sql, limit):
     return all_rows
 
 
-def get_data_from_sql(dim=10):
+def get_data_from_sql(dim=10, whole_table=False, padding=True):
+    sql = """
+    select table_text, pre_label, post_label, id
+    from label_table_head_info
+    where status = 0 and (update_user='test9' or update_user='test1' or update_user='test7' or update_user='test26')
+    ;
+    """
     # sql = """
     # select table_text, pre_label, post_label, id
     # from label_table_head_info
-    # where update_user <> 'test27' and update_user <> 'test20' and table_box_cnt >= 4 and table_box_cnt <= 200
+    # where status = 1 and update_time >= '2022-01-17' and update_time <= '2022-01-22'
     # ;
     # """
-    sql = """
-    select table_text, pre_label, post_label, id
-    from label_table_head_info 
-    where status = 1 and update_time >= '2022-01-17'
-    ;
-    """
 
     result_list = postgresql_util(sql, limit=1000000)
 
+    # IDs to exclude
+    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+        delete_id_list = eval(f.read())
+    with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "r") as f:
+        delete_id_list += eval(f.read())
+
     all_data_list = []
     all_data_label_list = []
     i = 0
@@ -71,6 +79,10 @@ def get_data_from_sql(dim=10):
         post_label = eval(table[2])
         _id = table[3]
 
+        if _id in delete_id_list:
+            print("pass", _id)
+            continue
+
         # table_text needs special handling
         try:
             table_text = table[0]
@@ -84,17 +96,35 @@ def get_data_from_sql(dim=10):
             print("无法识别table_text", _id)
             continue
 
-        # skip tables with only one row
-        if len(post_label) >= 2:
-            data_list, data_label_list = table_pre_process(table_text, post_label, _id)
-        elif len(pre_label) >= 2:
-            data_list, data_label_list = table_pre_process(table_text, pre_label, _id)
+        if whole_table:
+            if len(post_label) >= 2:
+                data_list, data_label_list = table_pre_process_2(table_text, post_label,
+                                                                 _id, padding=padding)
+            elif len(pre_label) >= 2:
+                data_list, data_label_list = table_pre_process_2(table_text, pre_label,
+                                                                 _id, padding=padding)
+            else:
+                data_list, data_label_list = [], []
         else:
-            data_list, data_label_list = [], []
+            # skip tables with only one row
+            if len(post_label) >= 2:
+                data_list, data_label_list = table_pre_process(table_text, post_label, _id)
+            elif len(pre_label) >= 2:
+                data_list, data_label_list = table_pre_process(table_text, pre_label, _id)
+            else:
+                data_list, data_label_list = [], []
 
         all_data_list += data_list
         all_data_label_list += data_label_list
 
+    # sort by table dimensions (row count, then column count)
+    if whole_table:
+        _list = []
+        for data, label in zip(all_data_list, all_data_label_list):
+            _list.append([data, label])
+        _list.sort(key=lambda x: (len(x[0]), len(x[0][0])))
+        all_data_list[:], all_data_label_list[:] = zip(*_list)
+
     print("len(all_data_list)", len(all_data_list))
     return all_data_list, all_data_label_list
 
@@ -206,7 +236,84 @@ def table_pre_process(text_list, label_list, _id, is_train=True):
         return data_list
 
 
-def get_data_from_file(file_type):
+def table_pre_process_2(text_list, label_list, _id, is_train=True, padding=True):
+    """
+    Process the whole table as a single array and pad its height/width dimensions
+
+    :param text_list:
+    :param label_list:
+    :param _id:
+    :param is_train:
+    :return:
+    """
+    # check whether the table height and width are reasonable
+    row_len = len(text_list)
+    best_row_len = get_best_padding_size(row_len, min_len=8)
+    col_len = len(text_list[0])
+    best_col_len = get_best_padding_size(col_len, min_len=8)
+    if best_row_len is None:
+        if is_train:
+            return [], []
+        else:
+            return []
+    if best_col_len is None:
+        if is_train:
+            return [], []
+        else:
+            return []
+
+    if is_train:
+        if len(text_list) != len(label_list):
+            print("文字单元格与标注单元格数量不匹配!", _id)
+            print("len(text_list)", len(text_list), "len(label_list)", len(label_list))
+            return [], []
+
+        if padding:
+            for i in range(row_len):
+                col_len = len(text_list[i])
+                text_list[i] += [None]*(best_col_len-col_len)
+                if is_train:
+                    label_list[i] += ["0"]*(best_col_len-col_len)
+            text_list += [[None]*best_col_len]*(best_row_len-row_len)
+            if is_train:
+                label_list += [["0"]*best_col_len]*(best_row_len-row_len)
+
+    if is_train:
+        for i in range(len(label_list)):
+            for j in range(len(label_list[i])):
+                label_list[i][j] = int(label_list[i][j])
+        return [text_list], [label_list]
+    else:
+        return [text_list]
+
+
+def get_best_padding_size(axis_len, min_len=3, max_len=300):
+    # sizes = [8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+    #          128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224,
+    #          232, 240, 248, 256, 264, 272, 280, 288, 296]
+    # sizes = [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57,
+    #          60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99, 102, 105, 108, 111,
+    #          114, 117, 120, 123, 126, 129, 132, 135, 138, 141, 144, 147, 150, 153, 156,
+    #          159, 162, 165, 168, 171, 174, 177, 180, 183, 186, 189, 192, 195, 198, 201,
+    #          204, 207, 210, 213, 216, 219, 222, 225, 228, 231, 234, 237, 240, 243, 246,
+    #          249, 252, 255, 258, 261, 264, 267, 270, 273, 276, 279, 282, 285, 288, 291,
+    #          294, 297]
+    sizes = []
+    for i in range(1, max_len):
+        if i * min_len <= max_len:
+            sizes.append(i * min_len)
+    if axis_len > sizes[-1]:
+        return axis_len
+    best_len = sizes[-1]
+    for height in sizes:
+        if axis_len <= height:
+            best_len = height
+            break
+    # print("get_best_padding_size", axis_len, best_len)
+    return best_len
+
+
+def get_data_from_file(file_type, model_id=1):
     if file_type == 'np':
         data_path = 'train_data/data_3.npy'
         data_label_path = 'train_data/data_label_3.npy'
@@ -215,17 +322,20 @@ def get_data_from_file(file_type):
         array2 = np.load(data_label_path)
         return array1, array2
     elif file_type == 'txt':
-        data_path = 'train_data/data3.txt'
-        data_label_path = 'train_data/data_label3.txt'
-
+        if model_id == 1:
+            data_path = 'train_data/data1.txt'
+            data_label_path = 'train_data/data_label1.txt'
+        elif model_id == 2:
+            data_path = 'train_data/data2.txt'
+            data_label_path = 'train_data/data_label2.txt'
+        elif model_id == 3:
+            data_path = 'train_data/data3.txt'
+            data_label_path = 'train_data/data_label3.txt'
         with open(data_path, 'r') as f:
             data_list = f.readlines()
         with open(data_label_path, 'r') as f:
             data_label_list = f.readlines()
 
-        # for i in range(len(data_list)):
-        #     data_list[i] = eval(data_list[i][:-1])
-        #     data_label_list[i] = eval(data_label_list[i][:-1])
         return data_list, data_label_list
     else:
         print("file type error! only np and txt supported")
@@ -245,18 +355,19 @@ def processed_save_to_np():
     #         f.write(str(line) + "\n")
 
 
-def processed_save_to_txt():
-    list1, list2 = get_data_from_sql()
+def processed_save_to_txt(whole_table=False, padding=True):
+    list1, list2 = get_data_from_sql(whole_table=whole_table, padding=padding)
 
     # shuffle
+    # if not whole_table or not padding:
     zip_list = list(zip(list1, list2))
     random.shuffle(zip_list)
     list1[:], list2[:] = zip(*zip_list)
 
-    with open('train_data/data3.txt', 'w') as f:
+    with open('train_data/data1.txt', 'w') as f:
         for line in list1:
             f.write(str(line) + "\n")
-    with open('train_data/data_label3.txt', 'w') as f:
+    with open('train_data/data_label1.txt', 'w') as f:
         for line in list2:
             f.write(str(line) + "\n")
 
@@ -287,7 +398,7 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
     data_num = len(data_list)
 
     # define the embedding output shape
-    output_shape = (6, 10, 60)
+    output_shape = (6, 20, 60)
 
     # fetch data batch by batch
     i = 0
@@ -349,8 +460,109 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
                    'input_4': X[3], 'input_5': X[4], 'input_6': X[5], }
 
 
+def my_data_loader_2(table_list, table_label_list, batch_size, is_train=True):
+    pad_len = 0
+
+    table_num = len(table_list)
+    if is_train and batch_size == 1:
+        table_list, table_label_list = get_random(table_list, table_label_list)
+
+    # Embedding shape
+    output_shape = (20, 60)
+
+    # fetch data batch by batch
+    i = 0
+    last_shape = None
+    while True:
+        new_table_list = []
+        new_table_label_list = []
+        for j in range(batch_size):
+            if i >= table_num:
+                i = 0
+                if is_train:
+                    table_list, table_label_list = get_random(table_list, table_label_list,
+                                                              seed=random.randint(1, 40))
+
+            if type(table_list[i]) != list:
+                table = eval(table_list[i][:-1])
+            else:
+                table = table_list[i]
+
+            if batch_size > 1:
+                if last_shape is None:
+                    last_shape = (len(table), len(table[0]))
+                    continue
+                if (len(table), len(table[0])) != last_shape:
+                    last_shape = (len(table), len(table[0]))
+                    break
+
+            if is_train:
+                table_label = eval(table_label_list[i][:-1])
+
+            # map Chinese characters to embeddings
+            for k in range(len(table)):
+                table[k] = embedding_word_forward(table[k], (len(table[k]),
+                                                     output_shape[0],
+                                                     output_shape[1]))
+            new_table_list.append(table)
+            if is_train:
+                new_table_label_list.append(table_label)
+            i += 1
+        new_table_list = np.array(new_table_list)
+        X = new_table_list
+        if X.shape[-2:] != output_shape:
+            # print("Dimension not match!", X.shape)
+            # print("\n")
+            continue
+
+        # compute the padding sizes
+        pad_height = get_best_padding_size(X.shape[1], pad_len)
+        pad_width = get_best_padding_size(X.shape[2], pad_len)
+        input_2 = np.zeros([1, X.shape[1], X.shape[2], pad_height, pad_width])
+
+        if is_train:
+            new_table_label_list = np.array(new_table_label_list)
+            Y = new_table_label_list
+            # Y = Y.astype(np.float32)
+            # yield {"input_1": X, "input_2": input_2}, \
+            #       {"output_1": Y, "output_2": Y}
+            yield {"input_1": X, "input_2": input_2}, \
+                  {"output": Y}
+        else:
+            yield {"input_1": X, "input_2": input_2}
+
+
+def check_train_data():
+    data_list, label_list = get_data_from_file('txt', model_id=2)
+    for data in data_list:
+        data = eval(data)
+        if len(data) % 8 != 0:
+            print(len(data))
+            print(len(data[0]))
+        for row in data:
+            if len(row) % 8 != 0:
+                print(len(data))
+                print(len(row))
+
+
+def get_random(text_list, label_list, seed=42):
+    random.seed(seed)
+    zip_list = list(zip(text_list, label_list))
+    random.shuffle(zip_list)
+    text_list[:], label_list[:] = zip(*zip_list)
+    return text_list, label_list
+
+
 if __name__ == '__main__':
-    processed_save_to_txt()
+    processed_save_to_txt(whole_table=False, padding=False)
     # data_balance()
 
     # test_embedding()
+    # check_train_data()
+
+    # _list = []
+    # for i in range(1, 100):
+    #     _list.append(i*3)
+    # print(_list)
+
+    # print(get_best_padding_size(9, 5))

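get_best_padding_size rounds a table dimension up to the next multiple of min_len (8 for the whole-table models) so padded tables share compatible shapes, and falls back to the raw length once it exceeds max_len. A standalone sketch of that rounding:

    def best_padding_size(axis_len, min_len=8, max_len=300):
        sizes = [i * min_len for i in range(1, max_len) if i * min_len <= max_len]
        if axis_len > sizes[-1]:
            return axis_len            # too large to pad, keep as-is
        for size in sizes:
            if axis_len <= size:
                return size            # first multiple that fits

    print(best_padding_size(9))    # 16
    print(best_padding_size(301))  # 301
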
File changes are not shown because they are too large
+ 121 - 82
BiddingKG/dl/table_head/predict.py


+ 57 - 33
BiddingKG/dl/table_head/train.py

@@ -2,24 +2,40 @@ import sys
 import os
 sys.path.append(os.path.abspath("../../.."))
 os.environ['KERAS_BACKEND'] = 'tensorflow'
-from keras.metrics import categorical_accuracy
+from BiddingKG.dl.table_head.models.layer_utils import MyModelCheckpoint
 from BiddingKG.dl.table_head.metrics import precision, recall, f1
 from keras import optimizers, Model
 from BiddingKG.dl.table_head.models.model import get_model
 from BiddingKG.dl.table_head.loss import focal_loss
 from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
-from BiddingKG.dl.table_head.pre_process import get_data_from_file, get_data_from_sql, my_data_loader
+from BiddingKG.dl.table_head.pre_process import get_data_from_file, get_data_from_sql, my_data_loader, my_data_loader_2, \
+    get_random
 from keras import backend as K
 
-
-input_shape = (6, 10, 60)
-output_shape = (1,)
-batch_size = 32
-epochs = 1000
-pretrained_path = "checkpoints/best.hdf5"
-checkpoint_path = "checkpoints/"
-PRETRAINED = True
-CHECKPOINT = False
+model_id = 1
+
+if model_id == 1:
+    input_shape = (6, 20, 60)
+    output_shape = (1,)
+    batch_size = 128
+    epochs = 1000
+    PRETRAINED = True
+    CHECKPOINT = False
+    # use the GPU
+    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+else:
+    input_shape = (None, None, 20, 60)
+    output_shape = (None, None)
+    batch_size = 1
+    epochs = 1000
+    PRETRAINED = False
+    CHECKPOINT = False
+    # use the CPU
+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+pretrained_path = "checkpoints/" + str(model_id) + "/best.hdf5"
+checkpoint_path = "checkpoints/" + str(model_id) + "/"
 
 
 def train():
@@ -27,22 +43,31 @@ def train():
     print("gpus", K.tensorflow_backend._get_available_gpus())
 
     # Data
-    data_x, data_y = get_data_from_file('txt')
-    # data_x = data_x[:60000]
-    # data_y = data_y[:60000]
+    data_x, data_y = get_data_from_file('txt', model_id=model_id)
     print("finish read data", len(data_x))
 
     # Split -> Train, Test
-    split_size = int(len(data_x)*0.1)
-    test_x, test_y = data_x[:split_size], data_y[:split_size]
-    train_x, train_y = data_x[split_size:], data_y[split_size:]
+    if model_id == 1:
+        split_size = int(len(data_x)*0.1)
+        test_x, test_y = data_x[:split_size], data_y[:split_size]
+        train_x, train_y = data_x[split_size:], data_y[split_size:]
+    else:
+        data_x, data_y = get_random(data_x, data_y)
+        split_size = int(len(data_x)*0.1)
+        test_x, test_y = data_x[:split_size], data_y[:split_size]
+        train_x, train_y = data_x[split_size:], data_y[split_size:]
+    print("len(train_x), len(test_x)", len(train_x), len(test_x))
 
     # Data Loader
-    train_data_loader = my_data_loader(train_x, train_y, batch_size=batch_size)
-    test_data_loader = my_data_loader(test_x, test_y, batch_size=batch_size)
+    if model_id == 1:
+        train_data_loader = my_data_loader(train_x, train_y, batch_size=batch_size)
+        test_data_loader = my_data_loader(test_x, test_y, batch_size=batch_size)
+    else:
+        train_data_loader = my_data_loader_2(train_x, train_y, batch_size=batch_size)
+        test_data_loader = my_data_loader_2(test_x, test_y, batch_size=1)
 
     # Model
-    model = get_model(input_shape, output_shape)
+    model = get_model(input_shape, output_shape, model_id=model_id)
     if PRETRAINED:
         model.load_weights(pretrained_path)
         print("read pretrained model", pretrained_path)
@@ -54,16 +79,20 @@ def train():
     else:
         print("no checkpoint")
 
-    filepath = 'e{epoch:02d}-f1{val_f1:.2f}'
-    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5", monitor='val_f1',
-                                 verbose=1, save_best_only=True, mode='max')
+    filepath = 'e-{epoch:02d}_f1-{val_f1:.2f}'
+    # filepath = 'e-{epoch:02d}_acc-{val_loss:.2f}'
+    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5",
+                                 monitor='val_f1',
+                                 verbose=1,
+                                 save_best_only=True,
+                                 mode='max')
 
-    model.compile(optimizer=optimizers.Adam(lr=0.005), loss=focal_loss(),
-    # model.compile(optimizer=optimizers.Adam(lr=0.005), loss='binary_crossentropy',
-                  metrics=['acc',
-                           precision, recall, f1])
+    model.compile(optimizer=optimizers.Adam(lr=0.0005),
+                  loss={"output": focal_loss(3., 0.5)},
+                  # loss_weights={"output": 0.5},
+                  metrics=['acc', precision, recall, f1])
 
-    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=5,
+    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=10,
                             verbose=1, mode='max', cooldown=0, min_lr=0)
 
     model.fit_generator(train_data_loader,
@@ -73,11 +102,6 @@ def train():
                         validation_steps=max(1, len(test_x) // batch_size),
                         epochs=epochs)
 
-    # model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y,
-    #           validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
-    #           epochs=epochs, batch_size=256, shuffle=True,
-    #           callbacks=[checkpoint, rlu])
-
     return model, test_x
 
 

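Training monitors a custom val_f1 metric for both checkpointing and learning-rate reduction. The repo's metric lives in BiddingKG.dl.table_head.metrics; the sketch below is only a common batch-wise formulation of such an F1 metric (an assumption, not the repo's code):

    import keras.backend as K

    def f1(y_true, y_pred):
        y_pred = K.round(K.clip(y_pred, 0, 1))          # threshold at 0.5
        tp = K.sum(y_true * y_pred)
        precision = tp / (K.sum(y_pred) + K.epsilon())
        recall = tp / (K.sum(y_true) + K.epsilon())
        return 2 * precision * recall / (precision + recall + K.epsilon())

    # used as: model.compile(..., metrics=['acc', precision, recall, f1])
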
+ 3 - 2
BiddingKG/dl/test/test4.py

@@ -46,7 +46,7 @@ def test(name,content):
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\test12354.txt","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
@@ -75,7 +75,8 @@ if __name__=="__main__":
     # '''
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
     # print(predict("12", content,"打印机"))
-    print(predict("12", text,"打印机"))
+    # content = codecs.open("D:\\Project\\format_conversion_maxcompute\\result.html", "r",encoding="utf8").read()
+    print(predict("12", content,"打印机"))
     # test(12,content)
     # test(12,text)
     print("takes",time.time()-_time1)

Some files are not shown because the diff is too large