Browse source

Merge remote-tracking branch 'origin/master'

znj 3 years ago
parent
commit
503218e064

+ 5 - 0
.gitignore

@@ -10,3 +10,8 @@
 /BiddingKG/dl/channel/data/
 /BiddingKG/dl/test
 node_modules
+/BiddingKG/dl/table_head/train_data/
+/BiddingKG/dl/table_head/check_user_result/
+/BiddingKG/dl/table_head/checkpoints/
+/BiddingKG/dl/table_head/data_new.csv
+/BiddingKG/dl/table_head/has_table_no_attach.xlsx

+ 1 - 0
.idea/compiler.xml

@@ -1,6 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="CompilerConfiguration">
+    <option name="BUILD_PROCESS_HEAP_SIZE" value="11000" />
     <bytecodeTargetLevel>
       <module name="BiddingKG" target="8" />
     </bytecodeTargetLevel>

+ 8 - 0
.idea/inspectionProfiles/Project_Default.xml

@@ -2,5 +2,13 @@
   <profile version="1.0">
     <option name="myName" value="Project Default" />
     <inspection_tool class="DuplicatedCode" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyInterpreterInspection" enabled="false" level="WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredIdentifiers">
+        <list>
+          <option value="tensorflow.nn.bidirectional_dynamic_rnn" />
+        </list>
+      </option>
+    </inspection_tool>
   </profile>
 </component>

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_15" project-jdk-name="Python 3.5 (BiddingKG)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>

+ 2 - 2
BiddingKG.iml

@@ -2,13 +2,13 @@
 <module type="JAVA_MODULE" version="4">
   <component name="FacetManager">
     <facet type="Python" name="Python">
-      <configuration sdkName="Python 3.5 (dl_nlp)" />
+      <configuration sdkName="Python 3.5 (BiddingKG)" />
     </facet>
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
     <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
-    <orderEntry type="library" exported="" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
+    <orderEntry type="library" name="Python 3.5 (BiddingKG) interpreter library" level="application" />
   </component>
 </module>

+ 29 - 0
BiddingKG/dl/common/Utils.py

@@ -686,6 +686,35 @@ def embedding_word(datas,shape):
         out_index += 1
     return embed
 
+
+def embedding_word_forward(datas,shape):
+    '''
+    @summary: look up the word vectors for the given tokens
+    @param:
+        datas: list of tokens
+        shape: shape of the result
+    @return: array, the word embeddings in the given shape
+    '''
+    model_w2v = getModel_word()
+    embed = np.zeros(shape)
+    length = shape[1]
+    out_index = 0
+    #print(datas)
+    for data in datas:
+        index = 0
+        for item in str(data)[:shape[1]]:
+            if index>=length:
+                break
+            if item in model_w2v.vocab:
+                embed[out_index][index] = model_w2v[item]
+                index += 1
+            else:
+                # embed[out_index][index] = model_w2v['unk']
+                index += 1
+        out_index += 1
+    return embed
+
+
 def formEncoding(text,shape=(100,60),expand=False):
     embedding = np.zeros(shape)
     word_model = getModel_word()
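
A hedged sketch of how the new embedding_word_forward helper would be called; the 10x60 per-cell character grid is an assumption based on shapes used elsewhere in this commit, not a value stated in Utils.py itself:

    import numpy as np
    from BiddingKG.dl.common.Utils import embedding_word_forward

    # two cell texts; each is cut to at most 10 characters, and every character that
    # exists in the word2vec vocabulary becomes a 60-dim vector (unknowns stay zero)
    cells = ["项目名称", "招标人"]
    embed = embedding_word_forward(cells, shape=(len(cells), 10, 60))
    print(embed.shape)  # (2, 10, 60)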

+ 25 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -8,6 +8,7 @@ import time
 import codecs
 
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
+from BiddingKG.dl.table_head.predict import predict
 
 sys.setrecursionlimit(1000000)
 sys.path.append(os.path.abspath("../.."))
@@ -414,6 +415,28 @@ def tableToText(soup):
         
         return inner_table,head_list
 
+    def set_head_model(inner_table):
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                inner_table[i][j] = inner_table[i][j][0]
+
+        # predict table headers with the model
+        predict_list = predict(inner_table)
+        with open(r"C:\Users\Administrator\Desktop\table_head_test.txt", "a") as f:
+            for i in range(len(predict_list)):
+                f.write(str(i) + " " + str(inner_table[i]) + "\n")
+                f.write(str(i) + " " + str(predict_list[i]) + "\n")
+            f.write("\n")
+
+        # print("table_list", inner_table)
+        # print("predict_list", predict_list)
+
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                inner_table[i][j] = [inner_table[i][j], int(predict_list[i][j])]
+        head_list = sliceTable(inner_table)
+        return inner_table, head_list
+
     def setHead_incontext(inner_table,pat_head,fix_value="~~",prob_min=0.5):
 
         data_x,data_position = getPredictor("form").getModel("context").encode(inner_table)
@@ -969,7 +992,8 @@ def tableToText(soup):
         if len(inner_table)>0 and len(inner_table[0])>0:
             #inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
             #inner_table,head_list = setHead_inline(inner_table)
-            inner_table,head_list = setHead_initem(inner_table,pat_head)
+            # inner_table, head_list = setHead_initem(inner_table,pat_head)
+            inner_table, head_list = set_head_model(inner_table)
             # inner_table,head_list = setHead_incontext(inner_table,pat_head)
             # print(inner_table)
             # for begin in range(len(head_list[:-1])):
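
For readers skimming the new set_head_model path: it strips each cell down to its text, asks the table_head predict model for a 0/1 header flag per cell, and re-wraps the cells before sliceTable runs. A minimal sketch of that round trip, with made-up values and assuming each cell arrives as a [text, flag] pair, as the indexing above suggests:

    # cells as they reach set_head_model: [text, flag] pairs
    inner_table = [[["序号", 1], ["名称", 1]],
                   [["1", 0], ["办公楼", 0]]]

    # step 1: keep only the text
    texts = [[cell[0] for cell in row] for row in inner_table]
    # step 2: predict(texts) is expected to return a same-shaped 0/1 matrix, e.g.
    predict_list = [[1, 1], [0, 0]]
    # step 3: re-wrap each cell as [text, head_flag] before calling sliceTable
    rebuilt = [[[texts[i][j], int(predict_list[i][j])] for j in range(len(texts[i]))]
               for i in range(len(texts))]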

BIN
BiddingKG/dl/table_head/checkpoints/best.hdf5 → BiddingKG/dl/table_head/best.hdf5


+ 62 - 17
BiddingKG/dl/table_head/check_user_label_accuracy.py

@@ -8,15 +8,16 @@ def user_label_accuracy(update_user):
         sql = """
         select table_text, pre_label, post_label, id
         from label_table_head_info 
-        where update_user='""" + update_user + "' order by id desc limit 3000"
+        where update_user='""" + update_user + "' order by update_time"
     else:
         sql = """
         select table_text, pre_label, post_label, id
         from label_table_head_info 
-        where update_user='""" + update_user + "' and status = 1 and update_time >= '2022-01-17'"
+        where update_user='""" + update_user + "' and status = 1 and update_time >= '2022-01-23'"
 
     result_list = postgresql_util(sql, limit=1000000)
     right_cnt = 0
+    error_cnt = 0
     error_id_list = []
     right_id_list = []
     i = 0
@@ -24,7 +25,10 @@ def user_label_accuracy(update_user):
     for table in result_list:
         i += 1
         if i % 1000 == 0:
-            print("Loop", i, right_cnt, time.time()-start_time)
+            if right_cnt + error_cnt != 0:
+                print("Loop", i, right_cnt/(right_cnt+error_cnt), time.time()-start_time)
+            else:
+                print("Loop", i, time.time()-start_time)
             start_time = time.time()
 
         pre_label = eval(table[1])
@@ -49,27 +53,44 @@ def user_label_accuracy(update_user):
         else:
             label_list = pre_label
 
-        predict_label_list = predict(table_text)
+        predict_label_list = predict(table_text, model_id=3)
         if predict_label_list:
             if str(label_list) == str(predict_label_list):
-                right_cnt += 1
                 right_id_list.append(str(_id)+"\n")
+                # right_cnt += 1
             else:
-                # cnt = 0
-                # for j in range(len(label_list)):
-                #     row1 = label_list[j]
-                #     row2 = predict_label_list[j]
-                #     if str(row1) != str(row2):
-                #         cnt += 1
-                #     if cnt >= 2:
-                #         error_id_list.append(str(_id)+"\n")
-                #         break
                 error_id_list.append(str(_id)+"\n")
+                # error_cnt += 1
+            if len(label_list) == len(predict_label_list):
+                for j in range(len(label_list)):
+                    for k in range(len(label_list[j])):
+                        if table_text[j][k] == "":
+                            continue
+                        if label_list[j][k] == "1" or predict_label_list[j][k] == "1":
+                            if len(table_text[j][k]) >= 20:
+                                continue
+                            else:
+                                if label_list[j][k] == predict_label_list[j][k]:
+                                    right_cnt += 1
+                                else:
+                                    error_cnt += 1
+            else:
+                print("len(label_list) != len(predict_label_list)", _id,
+                      len(label_list), len(predict_label_list))
 
-    accuracy = right_cnt / len(result_list)
+    accuracy = right_cnt / (right_cnt + error_cnt)
     print(update_user + " accuracy:", accuracy, 'total:', len(result_list))
     print("error_id_list", len(error_id_list))
 
+    save_path = "check_user_result/accuracy.txt"
+    with open(save_path, 'a') as f:
+        f.write(update_user + " "
+                + "表头正确率-" + str(round(accuracy, 2)) + " "
+                + "header accuracy-" + str(round(accuracy, 2)) + " "
+                + "documents-" + str(len(result_list)) + " "
+                + "cells compared-" + str(right_cnt + error_cnt) + " "
+                + "correct headers-" + str(right_cnt)
+
     save_path = "check_user_result/"+update_user+"_error.txt"
     with open(save_path, 'w') as f:
         f.writelines(error_id_list)
@@ -101,9 +122,33 @@ def get_single_result(_id):
 
 
 if __name__ == '__main__':
-    # users = ["test9", "test11", "test12", "test20", "test25", "test26", "test27"]
-    users = ["test20", "test27"]
+    # users = ["test9", "test11", "test12", "test25", "test26"]
+    # users = ["test9", "test11", ]
+    # users = ['test12', 'test25']
+    # users = ["test20", "test27"]
     # users = ['test']
+    users = [
+        "test1",
+        "test11",
+        "test12",
+        "test16",
+        "test17",
+        "test19",
+        "test20",
+        "test21",
+        "test22",
+        "test25",
+        "test26",
+        "test27",
+        "test29",
+        "test3",
+        "test7",
+        "test8",
+        "test9",
+    ]
+    users = ["test"]
+    users = ["test12", "test17", "test21", "test22", "test27", ]
+    users = ["test27"]
     acc_list = []
     for user in users:
         acc = user_label_accuracy(user)
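
Put plainly, the reworked scoring in user_label_accuracy only counts cells that at least one side marks as a header and whose text is shorter than 20 characters; empty cells are skipped. A tiny worked example of that counting rule with made-up labels:

    table_text    = [["序号", "名称"], ["1", "办公楼"]]
    label_list    = [["1", "1"], ["0", "0"]]
    predict_label = [["1", "0"], ["0", "0"]]

    right_cnt = error_cnt = 0
    for j in range(len(label_list)):
        for k in range(len(label_list[j])):
            if table_text[j][k] == "" or len(table_text[j][k]) >= 20:
                continue
            if label_list[j][k] == "1" or predict_label[j][k] == "1":
                if label_list[j][k] == predict_label[j][k]:
                    right_cnt += 1
                else:
                    error_cnt += 1

    print(right_cnt, error_cnt)  # 1 1 -> accuracy 0.5 for this table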

BIN
BiddingKG/dl/table_head/checkpoints/binary_loss/best.hdf5


BIN
BiddingKG/dl/table_head/checkpoints/focal_loss/best.hdf5


+ 7 - 1
BiddingKG/dl/table_head/loss.py

@@ -15,4 +15,10 @@ def focal_loss(gamma=2., alpha=.5):
                                * K.backend.log(K.backend.epsilon()+pt_1))\
                - K.backend.sum((1-alpha) * K.backend.pow(pt_0, gamma)
                                * K.backend.log(1. - pt_0 + K.backend.epsilon()))
-    return f_loss
+    return f_loss
+
+
+def union_loss(gamma=2., alpha=.5):
+    def _loss(y_true, y_pred):
+        # apply the focal loss closure to this batch instead of returning the closure itself
+        return focal_loss(gamma, alpha)(y_true, y_pred)
+    return _loss
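
The focal loss factory (and the union_loss wrapper above) is used like any other Keras loss: call the factory and hand the returned closure to compile(). A minimal sketch with a throwaway one-layer model, not the project's real training setup:

    from keras import layers, models
    from BiddingKG.dl.table_head.loss import focal_loss

    # toy model just to show how the loss factory is wired into compile()
    model = models.Sequential([layers.Dense(1, activation="sigmoid", input_shape=(4,))])
    model.compile(optimizer="adam", loss=focal_loss(gamma=2., alpha=.5))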

+ 272 - 0
BiddingKG/dl/table_head/models/layer_utils.py

@@ -0,0 +1,272 @@
+import os
+import sys
+import tensorflow as tf
+from keras.callbacks import Callback
+from keras.layers import Layer, warnings
+import numpy as np
+sys.path.append(os.path.dirname(__file__))
+from pre_process import get_best_padding_size
+
+
+class BatchReshape1(Layer):
+    """
+    Merge the table's row and column dimensions into the batch dimension
+    (batch, rows, cols, character_num, character_embed) -> (batch*rows*cols, character_num, character_embed)
+    """
+
+    def __init__(self, character_num, character_embed):
+        super(BatchReshape1, self).__init__()
+        self.character_num = character_num
+        self.character_embed = character_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch*height*width,
+                                      self.character_num, self.character_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, self.character_num, self.character_embed
+
+
+class BatchReshape2(Layer):
+    """
+    将Batch维度中的行列拆分出来
+    (batch*rows*cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape2, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class BatchReshape3(Layer):
+    """
+    Merge the table's row dimension into the batch dimension
+    (batch, rows, cols, cell_embed) -> (batch*rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape3, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch*height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, self.cell_embed
+
+
+class BatchReshape4(Layer):
+    """
+    Split the row dimension back out of the batch dimension
+    (batch*rows, cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape4, self).__init__()
+        self.supports_masking = True
+        self.cell_embed = cell_embed
+
+    def compute_mask(self, inputs, mask=None):
+        print(mask)
+        # if mask[0] is None:
+        #     return mask
+
+        # input1 = inputs[0]
+        # input2 = inputs[1]
+        # batch = tf.shape(input1)[0]
+        # height = tf.shape(input1)[1]
+        # width = tf.shape(input1)[2]
+        #
+        # mask_tensor = tf.reshape(mask[1], (batch, height, width, self.cell_embed))
+        return mask
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class BatchReshape5(Layer):
+    """
+    Flatten the table's rows and columns into a single sequence dimension
+    (batch, rows, cols, cell_embed) -> (batch, rows*cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape5, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch, height*width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, self.cell_embed
+
+
+class BatchReshape6(Layer):
+    """
+    Split the flattened sequence dimension back into rows and columns
+    (batch, rows*cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape6, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class MyPadding(Layer):
+    def __init__(self, pad_height, pad_width, cell_embed):
+        super(MyPadding, self).__init__()
+        self.pad_height = pad_height
+        self.pad_width = pad_width
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.pad(inputs, [[0, 0],
+                                  [0, self.pad_height - height],
+                                  [0, self.pad_width - width],
+                                  [0, 0]])
+
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class MySplit(Layer):
+    def __init__(self, height, width, **kwargs):
+        super(MySplit, self).__init__(**kwargs)
+        self.height = height
+        self.width = width
+
+    def call(self, inputs, mask=None, **kwargs):
+        outputs = inputs[:, 0:self.height, 0:self.width]
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None
+
+
+class MyModelCheckpoint(Callback):
+    def __init__(self, filepath, monitor='val_loss', verbose=0,
+                 save_best_only=False, save_weights_only=False,
+                 mode='auto', period=1):
+        super(MyModelCheckpoint, self).__init__()
+        self.monitor = monitor
+        self.verbose = verbose
+        self.filepath = filepath
+        self.save_best_only = save_best_only
+        self.save_weights_only = save_weights_only
+        self.period = period
+        self.epochs_since_last_save = 0
+
+        if mode not in ['auto', 'min', 'max']:
+            warnings.warn('ModelCheckpoint mode %s is unknown, '
+                          'fallback to auto mode.' % (mode),
+                          RuntimeWarning)
+            mode = 'auto'
+
+        if mode == 'min':
+            self.monitor_op = np.less
+            self.best = np.Inf
+        elif mode == 'max':
+            self.monitor_op = np.greater
+            self.best = -np.Inf
+        else:
+            if 'acc' in self.monitor or self.monitor.startswith('fmeasure'):
+                self.monitor_op = np.greater
+                self.best = -np.Inf
+            else:
+                self.monitor_op = np.less
+                self.best = np.Inf
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        self.epochs_since_last_save += 1
+        if self.epochs_since_last_save >= self.period:
+            self.epochs_since_last_save = 0
+            filepath = self.filepath.format(epoch=epoch + 1, **logs)
+            if self.save_best_only:
+                # self.monitor is assumed to hold two metric names; average them,
+                # guarding against a missing metric before doing the arithmetic
+                metric_a = logs.get(self.monitor[0])
+                metric_b = logs.get(self.monitor[1])
+                current = None if metric_a is None or metric_b is None else (metric_a + metric_b) / 2
+                if current is None:
+                    warnings.warn('Can save best model only with %s available, '
+                                  'skipping.' % (self.monitor), RuntimeWarning)
+                else:
+                    if self.monitor_op(current, self.best):
+                        if self.verbose > 0:
+                            print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
+                                  ' saving model to %s'
+                                  % (epoch + 1, self.monitor, self.best,
+                                     current, filepath))
+                        self.best = current
+                        if self.save_weights_only:
+                            self.model.save_weights(filepath, overwrite=True)
+                        else:
+                            self.model.save(filepath, overwrite=True)
+                    else:
+                        if self.verbose > 0:
+                            print('\nEpoch %05d: %s did not improve from %0.5f' %
+                                  (epoch + 1, self.monitor, self.best))
+            else:
+                if self.verbose > 0:
+                    print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
+                if self.save_weights_only:
+                    self.model.save_weights(filepath, overwrite=True)
+                else:
+                    self.model.save(filepath, overwrite=True)
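
All of the BatchReshape layers above lean on the same trick: read the dynamic batch/row/column sizes with tf.shape, fold rows and columns into the batch axis so a per-cell encoder can process every cell at once, then unfold afterwards. A bare TF1-style sketch of that fold/unfold (the 10x60 cell shape is illustrative):

    import numpy as np
    import tensorflow as tf

    x = tf.placeholder(tf.float32, shape=(None, None, None, 10, 60))  # (batch, rows, cols, chars, embed)
    batch, rows, cols = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2]
    folded = tf.reshape(x, (batch * rows * cols, 10, 60))        # BatchReshape1-style fold
    unfolded = tf.reshape(folded, (batch, rows, cols, 10 * 60))  # BatchReshape2-style unfold

    with tf.Session() as sess:
        out = sess.run(unfolded, feed_dict={x: np.zeros((2, 3, 4, 10, 60), np.float32)})
        print(out.shape)  # (2, 3, 4, 600)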

+ 232 - 0
BiddingKG/dl/table_head/models/loop_lstm.py

@@ -0,0 +1,232 @@
+import keras
+import tensorflow as tf
+from keras import models, backend as K
+from keras.layers import Layer, Input, Lambda, Concatenate, Dense, LSTM, Bidirectional
+from tensorflow.contrib.rnn import LSTMCell
+import numpy as np
+
+from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+from BiddingKG.dl.table_head.models.u_net import u_net_small
+
+
+def attention(inputs, w_omega, b_omega, u_omega, time_major=False):
+    if isinstance(inputs, tuple):
+        inputs = tf.concat(inputs, 2)
+    if time_major:  # (B,T,D) => (T,B,D)
+        inputs = tf.transpose(inputs, [1, 0, 2])
+    v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)
+
+    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
+    alphas = tf.nn.softmax(vu, name='alphas')  # (B,T) shape
+    # the result has (B,D) shape
+    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
+
+    return output, alphas
+
+
+class LoopCell(Layer):
+    def __init__(self, hidden_size, attention_size, character_num, character_embed,
+                 cell_embed):
+        super(LoopCell, self).__init__()
+
+        # Hyper parameters
+        self.hidden_size = hidden_size
+        self.attention_size = attention_size
+        self.character_num = character_num
+        self.character_embed = character_embed
+        self.cell_embed = cell_embed
+
+    def build(self, batch_input_shape):
+        super(LoopCell, self).build(batch_input_shape)
+
+        # Trainable parameters
+        # Attention
+        # self.w_omega = self.add_weight("w_omega", shape=[self.hidden_size*2, self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+        # self.b_omega = self.add_weight("b_omega", shape=[self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+        # self.u_omega = self.add_weight("u_omega", shape=[self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+
+        # Bi-LSTM
+        # self.forward_cell = LSTMCell(self.hidden_size, forget_bias=1.0, state_is_tuple=True)
+        # self.backward_cell = LSTMCell(self.hidden_size, forget_bias=1.0, state_is_tuple=True)
+        # self.bi_lism = Bidirectional(LSTM(self.hidden_size, return_sequences=True))
+        # self.bi_lism.build(input_shape=(None, self.character_num, self.character_embed))
+        # self.trainable_weights += self.bi_lism.trainable_weights
+        #
+        # self.self_attention = SeqSelfAttention(attention_activation='sigmoid')
+        # self.self_attention.build(input_shape=(None, self.character_num, 2*self.hidden_size))
+        # self.trainable_weights += self.self_attention.trainable_weights
+        # print(self.trainable_weights)
+
+        # DNN
+        # self.w1 = self.add_weight('W1', [2*self.attention_size, self.cell_embed],
+        #                           initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                           trainable=True)
+        #
+        # self.b1 = self.add_weight('b1', [self.cell_embed],
+        #                           initializer=tf.zeros_initializer(),
+        #                           trainable=True)
+        # self.dense = Dense(self.cell_embed, activation="relu")
+        # print(batch_input_shape[0], batch_input_shape[1], batch_input_shape[2])
+        # self.dense.build(input_shape=(batch_input_shape[0]*batch_input_shape[1]*batch_input_shape[2],
+        #                               2*self.attention_size))
+        # self.trainable_weights += self.dense.trainable_weights
+
+    def call(self, inputs, mask=None, **kwargs):
+        def fn(x):
+            print("fn_0", x)
+
+            # (batch*height*width, character_num, hidden_size)
+            # outputs, last_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.forward_cell,
+            #                                                        cell_bw=self.backward_cell,
+            #                                                        inputs=x,
+            #                                                        dtype=tf.float32,
+            #                                                        time_major=False)
+            # (batch*height*width, character_num, 2*hidden_size)
+            # outputs = self.bi_lism(x)
+            # print("fn_1", outputs)
+
+            # (batch*height*width, character_num, 2*hidden_size)
+            # outputs = self.self_attention(outputs)
+            # print("fn_2", outputs)
+
+            # (batch*height*width, 2*hidden_size)
+            # outputs, _ = attention(outputs, self.w_omega, self.b_omega,
+            #                        self.u_omega, time_major=False)
+
+
+            # (batch*height*width, cell_embedding)
+            # outputs = tf.nn.xw_plus_b(outputs, self.w1, self.b1)
+            # outputs = self.dense(outputs)
+            # print("fn_3", outputs)
+            return x  # every transform above is commented out, so pass the input through
+
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        # (batch, height*width, character_num(time_step), character_embedding)
+        # inputs = tf.reshape(inputs, (tf.shape(inputs)[0],
+        #                              height*width,
+        #                              inputs.shape[3], inputs.shape[4]))
+
+        # (batch*height*width, character_num, character_embedding)
+        outputs = tf.reshape(inputs, (batch*height*width,
+                                      inputs.shape[3], inputs.shape[4]))
+
+        # (height*width, batch, character_num(time_step), character_embedding)
+        # inputs = tf.transpose(inputs, (1, 0, 2, 3))
+
+        # split height*width, each cell
+        # (height*width, batch, cell_embedding)
+        # outputs = tf.map_fn(fn=lambda x: fn(x), elems=inputs, dtype=tf.float32)
+        # print("loop_lstm_1", outputs)
+        # outputs = tf.squeeze(outputs, 0)
+
+        # (batch*height*width, 2*attention_size)
+        # outputs = fn(inputs)
+        # print("loop_lstm_2", outputs)
+
+        # (1, batch*height*width, 2*attention_size)
+        # outputs = tf.expand_dims(outputs, 0)
+        # print("loop_lstm_3", outputs)
+
+        # (batch*height*width, cell_embedding)
+        # outputs = Dense(self.cell_embed, activation="relu")(outputs)
+        # print("loop_lstm_3", outputs)
+
+        # (batch, height*width, cell_embedding)
+        # outputs = tf.transpose(outputs, (1, 0, 2))
+        # print("loop_lstm_2", outputs)
+
+        # (batch, height, width, cell_embedding)
+        # outputs = tf.reshape(outputs, (batch, height, width, self.cell_embed))
+        # print("loop_lstm_4", outputs)
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, self.character_num, self.character_embed
+
+
+class BatchReshape(Layer):
+    def __init__(self, cell_embed):
+        super(BatchReshape, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        # (batch, height, width, cell_embedding)
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        print("batch_reshape", outputs)
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+# def batch_reshape(x):
+#     return K.reshape(x, (batch, height, width, cell_embed))
+
+
+if __name__ == '__main__':
+    input_shape = (16, 8, 10, 60)
+    hidden_size = 64
+    attention_size = 64
+    character_num = 10
+    character_embed = 60
+    cell_embed = 8
+
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    X_train = np.random.uniform(0, 1, (10, 16, 8, 10, 60))
+    X_test = np.random.uniform(0, 1, (10, 16, 8, 10, 60))
+    y_train = np.random.uniform(0, 1, (10, 16, 8))
+    y_test = np.random.uniform(0, 1, (10, 16, 8))
+
+    _input = Input(shape=input_shape, dtype="float32")
+    batch = K.shape(_input)[0]
+    height = K.shape(_input)[1]
+    width = K.shape(_input)[2]
+    print(batch, height, width)
+
+    loop_bi_lstm = LoopCell(hidden_size, attention_size,
+                            character_num, character_embed,
+                            cell_embed)(_input)
+    print("model_2_1", loop_bi_lstm)
+    dense = Dense(cell_embed, activation="relu")(loop_bi_lstm)
+    print("model_2_2", dense)
+    reshape = Lambda(lambda x: K.reshape(x, (batch, height, width, cell_embed)),
+                     output_shape=(height, width, cell_embed))(dense)
+    print("model_2_3", reshape)
+    u_net = u_net_small(loop_bi_lstm)
+    merge = Concatenate(axis=-1)([loop_bi_lstm, u_net])
+    dense = Dense(cell_embed, activation='relu')(merge)
+    dense = Dense(1, activation='sigmoid')(dense)
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+    model = models.Model(inputs=_input, outputs=squeeze)
+    model.summary(line_length=120)
+    model.compile(loss='binary_crossentropy', optimizer='adam')
+    model.fit(X_train, y_train,
+              epochs=2,
+              batch_size=1,
+              validation_data=(X_test, y_test))
+
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    X_train = np.random.uniform(0, 1, (5, 32, 24, 10, 60))
+    X_test = np.random.uniform(0, 1, (5, 32, 24, 10, 60))
+    y_train = np.random.uniform(0, 1, (5, 32, 24))
+    y_test = np.random.uniform(0, 1, (5, 32, 24))
+
+    model.fit(X_train, y_train,
+              epochs=2,
+              batch_size=1,
+              validation_data=(X_test, y_test))

+ 237 - 7
BiddingKG/dl/table_head/models/model.py

@@ -1,16 +1,21 @@
 import sys
 import os
+import numpy as np
+from keras.layers import Lambda, Dense, Reshape, Bidirectional, LSTM, Conv2D, BatchNormalization, LeakyReLU, Masking
+from keras_preprocessing.sequence import pad_sequences
+sys.path.append(os.path.dirname(__file__))
 
-from keras.layers import Lambda
-
-sys.path.append(os.path.abspath("../.."))
-from keras import layers, models
+from models.layer_utils import BatchReshape1, BatchReshape2, MyPadding, MySplit, BatchReshape3, \
+    BatchReshape4, BatchReshape5, BatchReshape6
+from keras import layers, models, Sequential
 import keras.backend as K
-from BiddingKG.dl.table_head.models.my_average_pooling import MyAveragePooling1D
-from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+import tensorflow as tf
+from models.my_average_pooling import MyAveragePooling1D
+from models.self_attention import SeqSelfAttention, MySelfAttention
+from models.u_net import u_net_small
 
 
-def get_model(input_shape, output_shape):
+def model_1(input_shape, output_shape):
     # Input (batch, 10, 60)
     input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
     input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
@@ -67,3 +72,228 @@ def get_model(input_shape, output_shape):
 
     model.summary()
     return model
+
+
+def model_2(input_shape, output_shape):
+    # input_shape = (None, None, 10, 60)
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    hidden_size = 64
+    attention_size = 64
+    character_num = 10
+    character_embed = 60
+    cell_embed = 1
+
+    # Input
+    input_1 = layers.Input(shape=input_shape, dtype="float32", name="input_1")
+    input_2 = layers.Input(shape=(None, None, None, None), dtype="int32", name="input_2")
+    # batch = tf.shape(_input)[0]
+    height = tf.shape(input_2)[1]
+    width = tf.shape(input_2)[2]
+    pad_height = tf.shape(input_2)[3]
+    pad_width = tf.shape(input_2)[4]
+
+    # print("batch, height, width", batch, height, width)
+
+    # Reshape
+    reshape = BatchReshape1(character_num, character_embed)(input_1)
+    print("model_2_0", reshape)
+
+    # Bi-LSTM + Attention
+    bi_lstm = Bidirectional(LSTM(hidden_size))(reshape)
+    print("model_2_1", bi_lstm)
+    # bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape)
+    # self_attention = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm)
+    # trans = Lambda(lambda x: tf.transpose(x, (0, 2, 1)))(self_attention)
+    # dense = Dense(1, activation='relu')(trans)
+    # squeeze = Lambda(lambda x: tf.squeeze(x, -1))(dense)
+
+    dense = Dense(1, activation="sigmoid")(bi_lstm)
+    print("model_2_2", dense)
+    # reshape = Lambda(batch_reshape, output_shape=(height, width, cell_embed))(dense)
+    reshape = BatchReshape2(cell_embed)([input_1, dense])
+    print("model_2_3", reshape)
+    # squeeze_1 = Lambda(lambda x: K.squeeze(x, axis=-1), name="output_1")(reshape)
+    # print("model_2_4", squeeze)
+
+    # Padding
+    padding = MyPadding(pad_height, pad_width, cell_embed)(reshape)
+    # padding = reshape
+    print("model_2_4", padding)
+
+    # U-Net
+    # u_net = u_net_small(padding)
+    # print("model_2_5", u_net)
+
+    # Conv 5*5
+    conv = Conv2D(1, (5, 5), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (5, 5), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (5, 5), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_1 = LeakyReLU(alpha=0.)(bn)
+
+    # Conv 3*3
+    conv = Conv2D(1, (3, 3), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (3, 3), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (3, 3), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_2 = LeakyReLU(alpha=0.)(bn)
+
+    # Conv 1*1
+    conv = Conv2D(1, (1, 1), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (1, 1), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (1, 1), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_3 = LeakyReLU(alpha=0.)(bn)
+
+    # conv = Conv2D(cell_embed, (3, 3), padding='same')(relu)
+    # bn = BatchNormalization()(conv)
+    # relu_2 = LeakyReLU(alpha=0.)(bn)
+
+    # Merge
+    # print("model_2_5", relu_1, relu_2)
+    merge = layers.Concatenate(axis=-1)([relu_1, relu_2, relu_3])
+    # merge = u_net
+    # merge = relu
+    dense = layers.Dense(1, activation='sigmoid')(merge)
+    squeeze_2 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+
+    # Split
+    split = MySplit(height, width, name="output")(squeeze_2)
+
+    model = models.Model(inputs=[input_1, input_2], outputs=split)
+    model.summary(line_length=120)
+    return model
+
+
+def model_3(input_shape, output_shape):
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+
+    hidden_size = 16
+    attention_size = 2*hidden_size
+    character_num = 20
+    character_embed = 60
+    cell_embed = 2*hidden_size
+    pad_len = 100
+    mask_timestamps = pad_len
+
+    # Input
+    input_1 = layers.Input(shape=input_shape, dtype="float32", name="input_1")
+    input_2 = layers.Input(shape=(None, None, None, None), dtype="int32", name="input_2")
+
+    # Reshape
+    reshape = BatchReshape1(character_num, character_embed)(input_1)
+    print("model_2_0", reshape)
+
+    # Bi-LSTM
+    bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape)
+    bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=False))(bi_lstm)
+    print("model_2_1", bi_lstm)
+
+    # Reshape
+    reshape = BatchReshape2(cell_embed)([input_1, bi_lstm])
+    print("model_2_3", reshape)
+
+    # Rows Reshape
+    reshape_1 = BatchReshape3(cell_embed)(reshape)
+
+    # Cols Reshape
+    trans = Lambda(lambda x: tf.transpose(x, (0, 2, 1, 3)))(reshape)
+    reshape_2 = BatchReshape3(cell_embed)(trans)
+
+    # All boxes Reshape
+    reshape_3 = BatchReshape5(cell_embed)(reshape)
+
+    # Masking
+    # mask_1 = Masking(mask_value=-1, input_shape=(mask_timestamps, cell_embed))(pad_1)
+    # mask_2 = Masking(mask_value=-1, input_shape=(mask_timestamps, cell_embed))(pad_2)
+    # print("model_2_4", mask_1)
+
+    # Padding
+    # pad_1 = MyPadding()
+
+    # Bi-LSTM
+    # bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))
+    # bi_lstm_1 = bi_lstm(reshape_1)
+    # bi_lstm_2 = bi_lstm(reshape_2)
+    bi_lstm_1 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_1)
+    bi_lstm_2 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_2)
+    # bi_lstm_1 = LSTM(2*hidden_size, return_sequences=True)(reshape_1)
+    # print("model_2_4", bi_lstm_1)
+    # bi_lstm_2 = LSTM(2*hidden_size, return_sequences=True)(reshape_2)
+    # self_attention_1 = MySelfAttention(output_dim=attention_size)(bi_lstm_1)
+    # self_attention_2 = MySelfAttention(output_dim=attention_size)(bi_lstm_2)
+
+    # Bi-LSTM + Attention
+    bi_lstm_3 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_3)
+    # bi_lstm_3 = LSTM(2*hidden_size, return_sequences=True)(reshape_3)
+    # self_attention_3 = MySelfAttention(output_dim=attention_size)(bi_lstm_3)
+    # print("model_2_5", bi_lstm_1)
+
+    # Reshape
+    reshape_1 = BatchReshape4(cell_embed)([reshape, bi_lstm_1])
+    reshape_2 = BatchReshape4(cell_embed)([trans, bi_lstm_2])
+    reshape_2 = Lambda(lambda x: tf.transpose(x, (0, 2, 1, 3)))(reshape_2)
+    reshape_3 = BatchReshape6(cell_embed)([reshape, bi_lstm_3])
+    print("model_2_6", reshape_1)
+
+    # Merge
+    merge = layers.Concatenate(axis=-1)([reshape, reshape_1, reshape_2, reshape_3])
+    dense = layers.Dense(hidden_size, activation='relu')(merge)
+    dense = layers.Dense(1, activation='sigmoid')(dense)
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1), name="output")(dense)
+
+    model = models.Model(inputs=[input_1, input_2], outputs=squeeze)
+    model.summary(line_length=110)
+    return model
+
+
+def get_model(input_shape, output_shape, model_id):
+    if model_id == 1:
+        return model_1(input_shape, output_shape)
+    elif model_id == 2:
+        return model_2(input_shape, output_shape)
+    elif model_id == 3:
+        return model_3(input_shape, output_shape)
+    else:
+        print("No such model!")
+        raise Exception()
+
+
+def test_layer():
+    model = Sequential()
+    model.add(Masking(mask_value=-1, input_shape=(5, 8)))
+    model.add(Lambda(lambda x: pad_sequences(x, maxlen=100, dtype='float32',
+                                             padding='post', truncating='post',
+                                             value=-1)))
+    model.add(Masking(mask_value=-1, input_shape=(5, 8)))
+    model.add(LSTM(32, return_sequences=True))
+
+    model.compile(optimizer='sgd', loss='mse')
+
+    x = np.zeros([1, 5, 8])
+    print(x.shape)
+    y = np.zeros([1, 5, 32])
+    model.summary()
+    model.fit(x, y, batch_size=32, epochs=10)
+
+
+if __name__ == "__main__":
+    test_layer()
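
A hedged sketch of how the new get_model dispatcher might be driven; the shapes below are illustrative guesses rather than values taken from the training script, and model_id=3 selects the row/column Bi-LSTM variant defined above:

    from BiddingKG.dl.table_head.models.model import get_model

    model = get_model(input_shape=(None, None, 20, 60), output_shape=(1,), model_id=3)
    model.compile(optimizer="adam", loss="binary_crossentropy")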

+ 58 - 0
BiddingKG/dl/table_head/models/model_2.py

@@ -0,0 +1,58 @@
+import sys
+import os
+sys.path.append(os.path.abspath("../.."))
+from keras import layers, models
+import tensorflow as tf
+from BiddingKG.dl.table_head.models.my_average_pooling import MyAveragePooling1D
+from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+
+
+def get_model(input_shape, output_shape):
+    # Input
+    input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_3 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_4 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_5 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_6 = layers.Input(shape=input_shape[1:], dtype="float32")
+
+    # Bi-LSTM
+    bi_lstm_1 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_1)
+    bi_lstm_2 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_2)
+    bi_lstm_3 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_3)
+    bi_lstm_4 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_4)
+    bi_lstm_5 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_5)
+    bi_lstm_6 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_6)
+
+    # Self-Attention
+    self_attention_1 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_1)
+    self_attention_2 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_2)
+    self_attention_3 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_3)
+    self_attention_4 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_4)
+    self_attention_5 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_5)
+    self_attention_6 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_6)
+
+    # Concat
+    concat_1 = layers.concatenate([self_attention_1, self_attention_2, self_attention_3])
+    concat_2 = layers.concatenate([self_attention_4, self_attention_5, self_attention_6])
+
+    # Dense + Sigmoid
+    dense_1 = layers.Dense(output_shape[0], activation="sigmoid")(concat_1)
+    dense_2 = layers.Dense(output_shape[0], activation="sigmoid")(concat_2)
+
+    # mask mean pooling
+    pool_1 = MyAveragePooling1D(axis=1)(dense_1)
+    pool_2 = MyAveragePooling1D(axis=1)(dense_2)
+
+    # Concat
+    concat = layers.concatenate([pool_1, pool_2])
+
+    # Dense
+    output = layers.Dense(10)(concat)
+    output = layers.Dense(1, activation="sigmoid", name='output')(output)
+
+    model = models.Model(inputs=[input_1, input_2, input_3, input_4, input_5, input_6],
+                         outputs=output)
+
+    model.summary()
+    return model

+ 40 - 1
BiddingKG/dl/table_head/models/self_attention.py

@@ -1,5 +1,6 @@
 import keras
 from keras import backend as K
+from keras.layers import Layer
 
 
 class SeqSelfAttention(keras.layers.Layer):
@@ -237,4 +238,42 @@ class SeqSelfAttention(keras.layers.Layer):
 
     @staticmethod
     def get_custom_objects():
-        return {'SeqSelfAttention': SeqSelfAttention}
+        return {'SeqSelfAttention': SeqSelfAttention}
+
+
+class MySelfAttention(Layer):
+    def __init__(self, output_dim, **kwargs):
+        self.output_dim = output_dim
+        super(MySelfAttention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # inputs.shape = (batch_size, time_steps, seq_len)
+        self.W_Q = self.add_weight(name='W_Q',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+        self.W_K = self.add_weight(name='W_K',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+        self.W_V = self.add_weight(name='W_V',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+
+        super(MySelfAttention, self).build(input_shape)
+
+    def call(self, x, mask=None, **kwargs):
+        _Q = K.dot(x, self.W_Q)
+        _K = K.dot(x, self.W_K)
+        _V = K.dot(x, self.W_V)
+
+        # use batch_dot with permute_dimensions in place of a plain transpose
+        _Z = K.batch_dot(_Q, K.permute_dimensions(_K, [0, 2, 1]))
+        _Z = _Z / (self.output_dim**0.5)
+        _Z = K.softmax(_Z)
+        _Z = K.batch_dot(_Z, _V)
+        return _Z
+
+    def compute_output_shape(self, input_shape):
+        return input_shape[0], input_shape[1], self.output_dim
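
MySelfAttention is plain scaled dot-product self-attention over the time axis (Q, K, V projections of the same sequence, softmax(QK^T / sqrt(d))V). A minimal usage sketch with made-up dimensions:

    import numpy as np
    from keras import layers, models
    from BiddingKG.dl.table_head.models.self_attention import MySelfAttention

    inp = layers.Input(shape=(10, 60))          # (batch, time_steps, features)
    att = MySelfAttention(output_dim=32)(inp)   # (batch, time_steps, 32)
    model = models.Model(inp, att)
    print(model.predict(np.zeros((2, 10, 60))).shape)  # (2, 10, 32)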

+ 49 - 0
BiddingKG/dl/table_head/models/tf_bi_lstm.py

@@ -0,0 +1,49 @@
+import tensorflow as tf
+from tensorflow.contrib.rnn import LSTMCell
+from tensorflow.contrib.rnn import MultiRNNCell
+
+
+class LstmBase:
+    """
+    build rnn cell
+    """
+    def build_rnn(self, hidden_size, num_layes):
+        cells = []
+        for i in range(num_layes):
+            cell = LSTMCell(num_units=hidden_size,
+                            state_is_tuple=True,
+                            initializer=tf.random_uniform_initializer(-0.25, 0.25))
+            cells.append(cell)
+        cells = MultiRNNCell(cells, state_is_tuple=True)
+
+        return cells
+
+
+class BiLstm(LstmBase):
+    """
+    define the lstm
+    """
+    def __init__(self, scope_name, hidden_size, num_layers):
+        super(BiLstm, self).__init__()
+        assert hidden_size % 2 == 0
+        hidden_size //= 2  # integer division so LSTMCell gets an int unit count
+
+        self.fw_rnns = []
+        self.bw_rnns = []
+        for i in range(num_layers):
+            self.fw_rnns.append(self.build_rnn(hidden_size, 1))
+            self.bw_rnns.append(self.build_rnn(hidden_size, 1))
+
+        self.scope_name = scope_name
+
+    def __call__(self, input, input_len):
+        for idx, (fw_rnn, bw_rnn) in enumerate(zip(self.fw_rnns, self.bw_rnns)):
+            scope_name = '{}_{}'.format(self.scope_name, idx)
+            ctx, _ = tf.nn.bidirectional_dynamic_rnn(
+                fw_rnn, bw_rnn, input, sequence_length=input_len,
+                dtype=tf.float32, time_major=False,
+                scope=scope_name
+            )
+            input = tf.concat(ctx, -1)
+        ctx = input
+        return ctx
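
BiLstm is a thin wrapper around tf.nn.bidirectional_dynamic_rnn that stacks one or more forward/backward LSTMCell pairs and concatenates their outputs. A TF1-style usage sketch with made-up shapes:

    import numpy as np
    import tensorflow as tf
    from BiddingKG.dl.table_head.models.tf_bi_lstm import BiLstm

    x = tf.placeholder(tf.float32, shape=(None, 10, 60))   # (batch, time, features)
    seq_len = tf.placeholder(tf.int32, shape=(None,))
    ctx = BiLstm("cell_encoder", hidden_size=64, num_layers=1)(x, seq_len)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        out = sess.run(ctx, feed_dict={x: np.zeros((2, 10, 60), np.float32),
                                       seq_len: [10, 10]})
        print(out.shape)  # (2, 10, 64): forward and backward halves concatenated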

+ 82 - 0
BiddingKG/dl/table_head/models/u_net.py

@@ -0,0 +1,82 @@
+from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, BatchNormalization, UpSampling2D
+from keras.layers import LeakyReLU
+
+
+def u_net_small(inputs, num_classes=1):
+    # 8
+    use_bias = False
+    down0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(inputs)
+    down0 = BatchNormalization()(down0)
+    down0 = LeakyReLU(alpha=0.)(down0)
+    down0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(down0)
+    down0 = BatchNormalization()(down0)
+    down0 = LeakyReLU(alpha=0.)(down0)
+    down0_pool = MaxPooling2D((2, 2), strides=(2, 2))(down0)
+
+    # 4
+    down1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(down0_pool)
+    down1 = BatchNormalization()(down1)
+    down1 = LeakyReLU(alpha=0.)(down1)
+    down1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(down1)
+    down1 = BatchNormalization()(down1)
+    down1 = LeakyReLU(alpha=0.)(down1)
+    down1_pool = MaxPooling2D((2, 2), strides=(2, 2))(down1)
+
+    # 2
+    down2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(down1_pool)
+    down2 = BatchNormalization()(down2)
+    down2 = LeakyReLU(alpha=0.)(down2)
+    down2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(down2)
+    down2 = BatchNormalization()(down2)
+    down2 = LeakyReLU(alpha=0.)(down2)
+    down2_pool = MaxPooling2D((2, 2), strides=(2, 2))(down2)
+
+    center = Conv2D(64, (3, 3), padding='same', use_bias=use_bias)(down2_pool)
+    center = BatchNormalization()(center)
+    center = LeakyReLU(alpha=0.)(center)
+    center = Conv2D(64, (3, 3), padding='same', use_bias=use_bias)(center)
+    center = BatchNormalization()(center)
+    center = LeakyReLU(alpha=0.)(center)
+
+    # 2
+    up2 = UpSampling2D((2, 2))(center)
+    up2 = concatenate([down2, up2], axis=3)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+
+    # 4
+    up1 = UpSampling2D((2, 2))(up2)
+    up1 = concatenate([down1, up1], axis=3)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+
+    # 8
+    up0 = UpSampling2D((2, 2))(up1)
+    up0 = concatenate([down0, up0], axis=3)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+
+    # classify
+    # classify = Conv2D(num_classes, (1, 1), activation='sigmoid')(up0)
+    return up0
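
u_net_small downsamples three times with 2x2 pooling and upsamples back, so its spatial input dims should be multiples of 8; that appears to be why get_best_padding_size in pre_process.py pads tables with min_len=8. A minimal sketch wrapping it in a Model (the 16x24x1 input is illustrative):

    from keras import layers, models
    from BiddingKG.dl.table_head.models.u_net import u_net_small

    inp = layers.Input(shape=(16, 24, 1))   # height and width are multiples of 8
    out = u_net_small(inp)                  # (batch, 16, 24, 8) feature map
    model = models.Model(inp, out)
    model.summary()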

+ 18 - 0
BiddingKG/dl/table_head/post_process.py

@@ -24,3 +24,21 @@ def table_post_process(table_text_list, predict_result, threshold=0.5):
         print("table_post_process output label dimensions do not match the text!")
         table_label_list = []
     return table_label_list
+
+
+def table_post_process_2(table_text_list, predict_result, threshold=0.5):
+    predict_result = predict_result.tolist()[0]
+    predict_list = []
+    for row in predict_result:
+        new_row = []
+        for col in row:
+            if col >= threshold:
+                new_row.append("1")
+            else:
+                new_row.append("0")
+        predict_list.append(new_row)
+
+    if len(predict_list) != len(table_text_list):
+        print("table_post_process_2 output label dimensions do not match the text!")
+        predict_list = []
+    return predict_list
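
table_post_process_2 simply thresholds the model output into per-cell '0'/'1' strings. A small worked example with a hand-made 1x2x2 prediction array (threshold 0.5, the default in the signature):

    import numpy as np
    from BiddingKG.dl.table_head.post_process import table_post_process_2

    predict_result = np.array([[[0.91, 0.73],
                                [0.12, 0.40]]])     # (1, rows, cols) model output
    table_text = [["序号", "名称"], ["1", "办公楼"]]

    print(table_post_process_2(table_text, predict_result))
    # [['1', '1'], ['0', '0']] -- one '0'/'1' string per cell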

+ 22 - 16
BiddingKG/dl/table_head/postgresql2csv.py

@@ -21,23 +21,29 @@ def eval_text_list(table_text):
 def read_postgresql(txt_name, start_id, _time):
     conn = psycopg2.connect(database="table_head_label", user="postgres",
                             password="postgres", host="192.168.2.103", port="5432")
-
-    with open('check_user_result/' + txt_name, "r") as f:
-        id_list = f.readlines()
-    # with open('check_user_result/test27.txt', "r") as f:
-    #     id_list += f.readlines()
-
-    _list = []
-    for _id in id_list:
-        _id = _id[:-1]
-        sql = 'select * from label_table_head_info where id =' + _id
+    row_list = []
+    if txt_name == "":
+        sql = """
+        select * from "label_table_head_info" 
+        where status = 1 and update_time >= '2022-01-17';
+        """
         df = pd.read_sql(sql=sql, con=conn)
-        # df = df[0]
         for index, row in df.iterrows():
-            _list.append([x for x in row])
-    cnt = 0
+            row_list.append([x for x in row])
+    else:
+        with open('check_user_result/' + txt_name, "r") as f:
+            id_list = f.readlines()
+        for _id in id_list:
+            _id = _id[:-1]
+            sql = 'select * from label_table_head_info where id =' + _id
+            df = pd.read_sql(sql=sql, con=conn)
+            # df = df[0]
+            for index, row in df.iterrows():
+                row_list.append([x for x in row])
+        cnt = 0
+
     new_list = []
-    for line in _list:
+    for line in row_list:
         try:
             table_text = eval_text_list(line[2])
         except:
@@ -57,7 +63,7 @@ def read_postgresql(txt_name, start_id, _time):
         label_list = predict(table_text)
         line[3] = str(label_list)
         new_list.append(line)
-    df = pd.DataFrame(_list)
+    df = pd.DataFrame(new_list)
     new_csv_path = "data_new.csv"
 
     df.to_csv(new_csv_path, index=False)
@@ -66,7 +72,7 @@ def read_postgresql(txt_name, start_id, _time):
 
 
 if __name__ == '__main__':
-    new_csv_path = read_postgresql('test20_error.txt', 203995, '2022-01-01 00:00:00')
+    new_csv_path = read_postgresql('test11_error.txt', 206863, '2021-12-31 00:00:00')
     # new_csv_path = read_postgresql('test20_right.txt', 203995, '')
     # df = pd.read_csv('data_new.csv')
     # print(df.iloc[:, 4])

+ 241 - 29
BiddingKG/dl/table_head/pre_process.py

@@ -1,8 +1,10 @@
+import os
 import random
-
+import sys
 import psycopg2
 import numpy as np
-from BiddingKG.dl.common.Utils import embedding_word
+sys.path.append(os.path.dirname(__file__) + "/../")
+from common.Utils import embedding_word, embedding_word_forward
 
 
 def get_sentence_index_list(sentence, dict_path='utils/ppocr_keys_v1.txt'):
@@ -42,22 +44,28 @@ def postgresql_util(sql, limit):
     return all_rows
 
 
-def get_data_from_sql(dim=10):
+def get_data_from_sql(dim=10, whole_table=False, padding=True):
+    sql = """
+    select table_text, pre_label, post_label, id
+    from label_table_head_info
+    where status = 0 and (update_user='test9' or update_user='test1' or update_user='test7' or update_user='test26')
+    ;
+    """
     # sql = """
     # select table_text, pre_label, post_label, id
     # from label_table_head_info
-    # where update_user <> 'test27' and update_user <> 'test20' and table_box_cnt >= 4 and table_box_cnt <= 200
+    # where status = 1 and update_time >= '2022-01-17' and update_time <= '2022-01-22'
     # ;
     # """
-    sql = """
-    select table_text, pre_label, post_label, id
-    from label_table_head_info 
-    where status = 1 and update_time >= '2022-01-17'
-    ;
-    """
 
     result_list = postgresql_util(sql, limit=1000000)
 
+    # ids to exclude
+    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+        delete_id_list = eval(f.read())
+    with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "r") as f:
+        delete_id_list += eval(f.read())
+
     all_data_list = []
     all_data_label_list = []
     i = 0
@@ -71,6 +79,10 @@ def get_data_from_sql(dim=10):
         post_label = eval(table[2])
         _id = table[3]
 
+        if _id in delete_id_list:
+            print("pass", _id)
+            continue
+
         # table_text requires special handling
         try:
             table_text = table[0]
@@ -84,17 +96,35 @@ def get_data_from_sql(dim=10):
             print("could not parse table_text", _id)
             continue
 
-        # also skip tables with only one row
-        if len(post_label) >= 2:
-            data_list, data_label_list = table_pre_process(table_text, post_label, _id)
-        elif len(pre_label) >= 2:
-            data_list, data_label_list = table_pre_process(table_text, pre_label, _id)
+        if whole_table:
+            if len(post_label) >= 2:
+                data_list, data_label_list = table_pre_process_2(table_text, post_label,
+                                                                 _id, padding=padding)
+            elif len(pre_label) >= 2:
+                data_list, data_label_list = table_pre_process_2(table_text, pre_label,
+                                                                 _id, padding=padding)
+            else:
+                data_list, data_label_list = [], []
         else:
-            data_list, data_label_list = [], []
+            # also skip tables with only one row
+            if len(post_label) >= 2:
+                data_list, data_label_list = table_pre_process(table_text, post_label, _id)
+            elif len(pre_label) >= 2:
+                data_list, data_label_list = table_pre_process(table_text, pre_label, _id)
+            else:
+                data_list, data_label_list = [], []
 
         all_data_list += data_list
         all_data_label_list += data_label_list
 
+    # sort by table dimensions (rows, then cols)
+    if whole_table:
+        _list = []
+        for data, label in zip(all_data_list, all_data_label_list):
+            _list.append([data, label])
+        _list.sort(key=lambda x: (len(x[0]), len(x[0][0])))
+        all_data_list[:], all_data_label_list[:] = zip(*_list)
+
     print("len(all_data_list)", len(all_data_list))
     return all_data_list, all_data_label_list
 
@@ -206,7 +236,84 @@ def table_pre_process(text_list, label_list, _id, is_train=True):
         return data_list
 
 
-def get_data_from_file(file_type):
+def table_pre_process_2(text_list, label_list, _id, is_train=True, padding=True):
+    """
+    表格处理,整个表格为一个数组,且填充长宽维度
+
+    :param text_list:
+    :param label_list:
+    :param _id:
+    :param is_train:
+    :return:
+    """
+    # check whether the table's height/width can be padded to supported sizes
+    row_len = len(text_list)
+    best_row_len = get_best_padding_size(row_len, min_len=8)
+    col_len = len(text_list[0])
+    best_col_len = get_best_padding_size(col_len, min_len=8)
+    if best_row_len is None:
+        if is_train:
+            return [], []
+        else:
+            return []
+    if best_col_len is None:
+        if is_train:
+            return [], []
+        else:
+            return []
+
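+    # NOTE: get_best_padding_size (defined below) never actually returns None;
+    # an oversized axis is returned at its own length, so the two None guards
+    # above are defensive only.
+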
+    if is_train:
+        if len(text_list) != len(label_list):
+            print("文字单元格与标注单元格数量不匹配!", _id)
+            print("len(text_list)", len(text_list), "len(label_list)", len(label_list))
+            return [], []
+
+        if padding:
+            for i in range(row_len):
+                col_len = len(text_list[i])
+                text_list[i] += [None]*(best_col_len-col_len)
+                if is_train:
+                    label_list[i] += ["0"]*(best_col_len-col_len)
+            text_list += [[None]*best_col_len]*(best_row_len-row_len)
+            if is_train:
+                label_list += [["0"]*best_col_len]*(best_row_len-row_len)
+
+    if is_train:
+        for i in range(len(label_list)):
+            for j in range(len(label_list[i])):
+                label_list[i][j] = int(label_list[i][j])
+        return [text_list], [label_list]
+    else:
+        return [text_list]
+
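+# A minimal usage sketch, added for illustration only: _example_table_pre_process_2
+# is a hypothetical helper (not used anywhere in the pipeline) and its 2x3 table
+# is made up, not taken from the labeling database.
+def _example_table_pre_process_2():
+    texts = [["name", "qty", "price"], ["printer", "2", ""]]
+    labels = [["1", "1", "1"], ["0", "0", "0"]]
+    # both lists are padded in place and returned wrapped in an outer list
+    data, label = table_pre_process_2(texts, labels, _id=0)
+    # height and width are padded up to the next multiple of 8
+    assert len(data[0]) == 8 and len(data[0][0]) == 8
+    # real labels are cast to int, padded cells get label 0
+    assert label[0][0][0] == 1 and label[0][-1][-1] == 0
+    return data, label
+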
+
+def get_best_padding_size(axis_len, min_len=3, max_len=300):
+    # sizes = [8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+    #          128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224,
+    #          232, 240, 248, 256, 264, 272, 280, 288, 296]
+    # sizes = [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57,
+    #          60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99, 102, 105, 108, 111,
+    #          114, 117, 120, 123, 126, 129, 132, 135, 138, 141, 144, 147, 150, 153, 156,
+    #          159, 162, 165, 168, 171, 174, 177, 180, 183, 186, 189, 192, 195, 198, 201,
+    #          204, 207, 210, 213, 216, 219, 222, 225, 228, 231, 234, 237, 240, 243, 246,
+    #          249, 252, 255, 258, 261, 264, 267, 270, 273, 276, 279, 282, 285, 288, 291,
+    #          294, 297]
+    sizes = []
+    for i in range(1, max_len):
+        if i * min_len <= max_len:
+            sizes.append(i * min_len)
+    if axis_len > sizes[-1]:
+        return axis_len
+    best_len = sizes[-1]
+    for height in sizes:
+        if axis_len <= height:
+            best_len = height
+            break
+    # print("get_best_padding_size", axis_len, best_len)
+    return best_len
+
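+# For illustration: the result is the next multiple of min_len at or above axis_len,
+# e.g. get_best_padding_size(13, min_len=8) == 16 and get_best_padding_size(9, 5) == 10;
+# an axis longer than the largest generated multiple is returned unchanged, so the
+# function never returns None.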
+
+def get_data_from_file(file_type, model_id=1):
     if file_type == 'np':
         data_path = 'train_data/data_3.npy'
         data_label_path = 'train_data/data_label_3.npy'
@@ -215,17 +322,20 @@ def get_data_from_file(file_type):
         array2 = np.load(data_label_path)
         return array1, array2
     elif file_type == 'txt':
-        data_path = 'train_data/data3.txt'
-        data_label_path = 'train_data/data_label3.txt'
-
+        if model_id == 1:
+            data_path = 'train_data/data1.txt'
+            data_label_path = 'train_data/data_label1.txt'
+        elif model_id == 2:
+            data_path = 'train_data/data2.txt'
+            data_label_path = 'train_data/data_label2.txt'
+        elif model_id == 3:
+            data_path = 'train_data/data3.txt'
+            data_label_path = 'train_data/data_label3.txt'
+        else:
+            print("model_id error! only 1, 2 and 3 supported")
+            return [], []
         with open(data_path, 'r') as f:
             data_list = f.readlines()
         with open(data_label_path, 'r') as f:
             data_label_list = f.readlines()
 
-        # for i in range(len(data_list)):
-        #     data_list[i] = eval(data_list[i][:-1])
-        #     data_label_list[i] = eval(data_label_list[i][:-1])
         return data_list, data_label_list
     else:
         print("file type error! only np and txt supported")
@@ -245,18 +355,19 @@ def processed_save_to_np():
     #         f.write(str(line) + "\n")
 
 
-def processed_save_to_txt():
-    list1, list2 = get_data_from_sql()
+def processed_save_to_txt(whole_table=False, padding=True):
+    list1, list2 = get_data_from_sql(whole_table=whole_table, padding=padding)
 
     # shuffle
+    # if not whole_table or not padding:
     zip_list = list(zip(list1, list2))
     random.shuffle(zip_list)
     list1[:], list2[:] = zip(*zip_list)
 
-    with open('train_data/data3.txt', 'w') as f:
+    with open('train_data/data1.txt', 'w') as f:
         for line in list1:
             f.write(str(line) + "\n")
-    with open('train_data/data_label3.txt', 'w') as f:
+    with open('train_data/data_label1.txt', 'w') as f:
         for line in list2:
             f.write(str(line) + "\n")
 
@@ -287,7 +398,7 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
     data_num = len(data_list)
 
     # embedding output shape
-    output_shape = (6, 10, 60)
+    output_shape = (6, 20, 60)
 
     # fetch data batch by batch
     i = 0
@@ -349,8 +460,109 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
                    'input_4': X[3], 'input_5': X[4], 'input_6': X[5], }
 
 
+def my_data_loader_2(table_list, table_label_list, batch_size, is_train=True):
+    pad_len = 0
+
+    table_num = len(table_list)
+    if is_train and batch_size == 1:
+        table_list, table_label_list = get_random(table_list, table_label_list)
+
+    # Embedding shape
+    output_shape = (20, 60)
+
+    # fetch data batch by batch
+    i = 0
+    last_shape = None
+    while True:
+        new_table_list = []
+        new_table_label_list = []
+        for j in range(batch_size):
+            if i >= table_num:
+                i = 0
+                if is_train:
+                    table_list, table_label_list = get_random(table_list, table_label_list,
+                                                              seed=random.randint(1, 40))
+
+            if type(table_list[i]) != list:
+                table = eval(table_list[i][:-1])
+            else:
+                table = table_list[i]
+
+            if batch_size > 1:
+                if last_shape is None:
+                    last_shape = (len(table), len(table[0]))
+                    continue
+                if (len(table), len(table[0])) != last_shape:
+                    last_shape = (len(table), len(table[0]))
+                    break
+
+            if is_train:
+                table_label = eval(table_label_list[i][:-1])
+
+            # map the characters of each cell to word-vector embeddings
+            for k in range(len(table)):
+                table[k] = embedding_word_forward(table[k], (len(table[k]),
+                                                     output_shape[0],
+                                                     output_shape[1]))
+            new_table_list.append(table)
+            if is_train:
+                new_table_label_list.append(table_label)
+            i += 1
+        new_table_list = np.array(new_table_list)
+        X = new_table_list
+        if X.shape[-2:] != output_shape:
+            # print("Dimension not match!", X.shape)
+            # print("\n")
+            continue
+
+        # compute the padding sizes
+        pad_height = get_best_padding_size(X.shape[1], pad_len)
+        pad_width = get_best_padding_size(X.shape[2], pad_len)
+        input_2 = np.zeros([1, X.shape[1], X.shape[2], pad_height, pad_width])
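+        # pad_len is 0 here, so get_best_padding_size returns the raw axis lengths
+        # and input_2 is just a zero tensor mirroring the table's row/column dims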
+
+        if is_train:
+            new_table_label_list = np.array(new_table_label_list)
+            Y = new_table_label_list
+            # Y = Y.astype(np.float32)
+            # yield {"input_1": X, "input_2": input_2}, \
+            #       {"output_1": Y, "output_2": Y}
+            yield {"input_1": X, "input_2": input_2}, \
+                  {"output": Y}
+        else:
+            yield {"input_1": X, "input_2": input_2}
+
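+# Usage sketch (illustration only): train.py passes this generator straight to
+# Keras fit_generator when model_id != 1, roughly:
+#     loader = my_data_loader_2(table_list, table_label_list, batch_size=1)
+#     inputs, targets = next(loader)
+#     # inputs["input_1"].shape == (1, rows, cols, 20, 60)
+#     # inputs["input_2"].shape == (1, rows, cols, rows, cols)   zero placeholder
+#     # targets["output"].shape == (1, rows, cols)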
+
+def check_train_data():
+    data_list, label_list = get_data_from_file('txt', model_id=2)
+    for data in data_list:
+        data = eval(data)
+        if len(data) % 8 != 0:
+            print(len(data))
+            print(len(data[0]))
+        for row in data:
+            if len(row) % 8 != 0:
+                print(len(data))
+                print(len(row))
+
+
+def get_random(text_list, label_list, seed=42):
+    random.seed(seed)
+    zip_list = list(zip(text_list, label_list))
+    random.shuffle(zip_list)
+    text_list[:], label_list[:] = zip(*zip_list)
+    return text_list, label_list
+
+
 if __name__ == '__main__':
-    processed_save_to_txt()
+    processed_save_to_txt(whole_table=False, padding=False)
     # data_balance()
 
     # test_embedding()
+    # check_train_data()
+
+    # _list = []
+    # for i in range(1, 100):
+    #     _list.append(i*3)
+    # print(_list)
+
+    # print(get_best_padding_size(9, 5))

Diff for this file is too large to display
+ 121 - 82
BiddingKG/dl/table_head/predict.py


+ 17 - 0
BiddingKG/dl/table_head/preprocessing_test.py

@@ -0,0 +1,17 @@
+import codecs
+import pandas as pd
+from bs4 import BeautifulSoup
+from BiddingKG.dl.interface.extract import predict
+
+
+def test():
+    df = pd.read_excel("has_table_no_attach.xlsx")
+    for index, row in df.iterrows():
+        if index % 100 == 0:
+            print("Loop", index)
+        text = row['dochtmlcon']
+        predict(str(index), text)
+
+
+if __name__ == "__main__":
+    test()

+ 188 - 0
BiddingKG/dl/table_head/table_simplify.py

@@ -0,0 +1,188 @@
+#coding:utf-8
+import json
+import logging
+
+from BiddingKG.dl.table_head.pre_process import postgresql_util
+
+
+user_score = {
+    "test": 1.,
+    "test1": 0.83,
+    "test11": 0.82,
+    "test12": 0.74,
+    "test16": 0.83,
+    "test17": 0.77,
+    "test19": 0.79,
+    "test20": 0.82,
+    "test21": 0.73,
+    "test22": 0.64,
+    "test25": 0.77,
+    "test26": 0.80,
+    "test27": 0.72,
+    "test29": 0.8,
+    "test3": 0.,
+    "test7": 0.82,
+    "test8": 0.78,
+    "test9": 0.80,
+}
+
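+# Annotator quality scores: when remove_duplicate finds two near-identical tables
+# it keeps the copy labeled by the higher-scoring user (unknown users count as 0).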
+
+def get_labeled_table():
+    sql = """
+    select id, update_user, table_text, pre_label, post_label
+    from label_table_head_info where status = 0
+    """
+
+    result_list = postgresql_util(sql, limit=1000000)
+    print("len(result_list)", len(result_list))
+    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+        not_eval_table_list = f.read()
+    not_eval_table_list = eval(not_eval_table_list)
+
+    table_list = []
+    # not_eval_table_list = []
+    for table in result_list:
+        pre_label = eval(table[3])
+        post_label = eval(table[4])
+        _id = table[0]
+        update_user = table[1]
+        table_text = table[2]
+        if _id in not_eval_table_list:
+            continue
+
+        try:
+            if table_text[0] == '"':
+                table_text = eval(table_text)
+            table_text = table_text.replace('\\', '/')
+            table_text = eval(table_text)
+        except:
+            print("无法识别table_text", _id)
+            not_eval_table_list.append(_id)
+            continue
+
+        if post_label:
+            label_list = post_label
+        else:
+            label_list = pre_label
+
+        table_list.append([table_text, label_list, update_user, _id])
+    print("len(table_list)", len(table_list))
+    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "w") as f:
+    #     f.write(str(not_eval_table_list))
+    return table_list
+
+
+def table_distance(table1, table2, thresh=0.85):
+    # flatten
+    table1 = [col for row in table1 for col in row]
+    table2 = [col for row in table2 for col in row]
+    while "" in table1:
+        table1.remove("")
+    while "" in table2:
+        table2.remove("")
+
+    equal_cnt = 0
+    not_equal_cnt = 0
+    equal_flag = 0
+    for col1 in table1:
+        find_flag = 0
+        for col2 in table2:
+            if col1 == col2:
+                equal_cnt += 1
+                find_flag = 1
+                break
+        if not find_flag:
+            not_equal_cnt += 1
+        # print(equal_cnt, not_equal_cnt)
+        if round(equal_cnt / max(len(table1), len(table2)), 2) >= thresh:
+            # print("> thresh")
+            equal_flag = 1
+            break
+        if round(not_equal_cnt / max(len(table1), len(table2)), 2) >= 1-thresh:
+            # print("> 1-thresh")
+            equal_flag = 0
+            break
+    return equal_flag
+
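+# For illustration: two flattened tables count as duplicates once at least thresh
+# (85%) of the larger one's non-empty cells appear in the other, e.g.
+#     table_distance([['a', 'b']], [['a', 'b']]) == 1
+#     table_distance([['a', 'b']], [['a', 'c']]) == 0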
+
+def remove_duplicate(table_list):
+    logging.info("into remove_duplicate")
+    table_list.sort(key=lambda x: x[0])
+    delete_table_id_list = []
+    for i in range(len(table_list)):
+        delete_table_id_list = list(set(delete_table_id_list))
+        if i % 1000 == 0:
+            print("Loop", i, "len(delete_table_id_list)", len(delete_table_id_list))
+            logging.info("*")
+            with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "w") as f:
+                f.write(str(delete_table_id_list))
+        table1 = table_list[i]
+        if len(table1[0]) <= 2 and len(table1[0][0]) <= 2:
+            delete_table_id_list.append(table1[3])
+            continue
+        for j in range(i+1, len(table_list)):
+            table2 = table_list[j]
+            if len(table2[0]) <= 2 and len(table2[0][0]) <= 2:
+                delete_table_id_list.append(table2[3])
+                continue
+            # skip if the row counts differ by 2 or more
+            if abs(len(table1[0]) - len(table2[0])) >= 2:
+                continue
+            # skip if the column counts differ by 2 or more
+            if abs(len(table1[0][0]) - len(table2[0][0])) >= 2:
+                continue
+            if table_distance(table1[0], table2[0]):
+                print("equal", table1[3], table2[3])
+                score1 = user_score.get(table1[2])
+                score2 = user_score.get(table2[2])
+                if score1 is None:
+                    score1 = 0.
+                if score2 is None:
+                    score2 = 0.
+                if score1 >= score2:
+                    delete_table_id_list.append(table2[3])
+                else:
+                    delete_table_id_list.append(table1[3])
+
+    delete_table_id_list = list(set(delete_table_id_list))
+    new_table_list = []
+    for table in table_list:
+        if table[3] not in delete_table_id_list:
+            new_table_list.append(table)
+    return new_table_list
+
+
+def eval_table(_str):
+    try:
+        if _str[0] == '"':
+            table_text = eval(_str)
+        else:
+            table_text = _str
+        table_text = table_text.replace('\\', '/')
+        table_text = eval(table_text)
+    except:
+        print("无法识别table_text")
+        table_text = ""
+    return table_text
+
+
+if __name__ == '__main__':
+    _list = get_labeled_table()
+    _list = remove_duplicate(_list)
+    _str = json.dumps(str(_list))
+    with open(r"C:\Users\Administrator\Desktop\table_simplify.txt", "w") as f:
+        f.write(_str)
+
+    # _str1 = "[['', '', 'Yes']]"
+    # _str2 = "[['', '', 'Yes', '']]"
+    # table1 = eval_table(_str1)
+    # table2 = eval_table(_str2)
+    #
+    # print(table_distance(table1, table2))
+
+    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+    #     not_eval_table_list = f.read()
+    # print(not_eval_table_list)
+    # not_eval_table_list = eval(not_eval_table_list)

+ 57 - 33
BiddingKG/dl/table_head/train.py

@@ -2,24 +2,40 @@ import sys
 import os
 sys.path.append(os.path.abspath("../../.."))
 os.environ['KERAS_BACKEND'] = 'tensorflow'
-from keras.metrics import categorical_accuracy
+from BiddingKG.dl.table_head.models.layer_utils import MyModelCheckpoint
 from BiddingKG.dl.table_head.metrics import precision, recall, f1
 from keras import optimizers, Model
 from BiddingKG.dl.table_head.models.model import get_model
 from BiddingKG.dl.table_head.loss import focal_loss
 from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
-from BiddingKG.dl.table_head.pre_process import get_data_from_file, get_data_from_sql, my_data_loader
+from BiddingKG.dl.table_head.pre_process import get_data_from_file, get_data_from_sql, my_data_loader, my_data_loader_2, \
+    get_random
 from keras import backend as K
 
-
-input_shape = (6, 10, 60)
-output_shape = (1,)
-batch_size = 32
-epochs = 1000
-pretrained_path = "checkpoints/best.hdf5"
-checkpoint_path = "checkpoints/"
-PRETRAINED = True
-CHECKPOINT = False
+model_id = 1
+
+if model_id == 1:
+    input_shape = (6, 20, 60)
+    output_shape = (1,)
+    batch_size = 128
+    epochs = 1000
+    PRETRAINED = True
+    CHECKPOINT = False
+    # use the GPU
+    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+else:
+    input_shape = (None, None, 20, 60)
+    output_shape = (None, None)
+    batch_size = 1
+    epochs = 1000
+    PRETRAINED = False
+    CHECKPOINT = False
+    # use the CPU
+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+pretrained_path = "checkpoints/" + str(model_id) + "/best.hdf5"
+checkpoint_path = "checkpoints/" + str(model_id) + "/"
 
 
 def train():
@@ -27,22 +43,31 @@ def train():
     print("gpus", K.tensorflow_backend._get_available_gpus())
 
     # Data
-    data_x, data_y = get_data_from_file('txt')
-    # data_x = data_x[:60000]
-    # data_y = data_y[:60000]
+    data_x, data_y = get_data_from_file('txt', model_id=model_id)
     print("finish read data", len(data_x))
 
     # Split -> Train, Test
-    split_size = int(len(data_x)*0.1)
-    test_x, test_y = data_x[:split_size], data_y[:split_size]
-    train_x, train_y = data_x[split_size:], data_y[split_size:]
+    if model_id == 1:
+        split_size = int(len(data_x)*0.1)
+        test_x, test_y = data_x[:split_size], data_y[:split_size]
+        train_x, train_y = data_x[split_size:], data_y[split_size:]
+    else:
+        data_x, data_y = get_random(data_x, data_y)
+        split_size = int(len(data_x)*0.1)
+        test_x, test_y = data_x[:split_size], data_y[:split_size]
+        train_x, train_y = data_x[split_size:], data_y[split_size:]
+    print("len(train_x), len(test_x)", len(train_x), len(test_x))
 
     # Data Loader
-    train_data_loader = my_data_loader(train_x, train_y, batch_size=batch_size)
-    test_data_loader = my_data_loader(test_x, test_y, batch_size=batch_size)
+    if model_id == 1:
+        train_data_loader = my_data_loader(train_x, train_y, batch_size=batch_size)
+        test_data_loader = my_data_loader(test_x, test_y, batch_size=batch_size)
+    else:
+        train_data_loader = my_data_loader_2(train_x, train_y, batch_size=batch_size)
+        test_data_loader = my_data_loader_2(test_x, test_y, batch_size=1)
 
     # Model
-    model = get_model(input_shape, output_shape)
+    model = get_model(input_shape, output_shape, model_id=model_id)
     if PRETRAINED:
         model.load_weights(pretrained_path)
         print("read pretrained model", pretrained_path)
@@ -54,16 +79,20 @@ def train():
     else:
         print("no checkpoint")
 
-    filepath = 'e{epoch:02d}-f1{val_f1:.2f}'
-    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5", monitor='val_f1',
-                                 verbose=1, save_best_only=True, mode='max')
+    filepath = 'e-{epoch:02d}_f1-{val_f1:.2f}'
+    # filepath = 'e-{epoch:02d}_acc-{val_loss:.2f}'
+    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5",
+                                 monitor='val_f1',
+                                 verbose=1,
+                                 save_best_only=True,
+                                 mode='max')
 
-    model.compile(optimizer=optimizers.Adam(lr=0.005), loss=focal_loss(),
-    # model.compile(optimizer=optimizers.Adam(lr=0.005), loss='binary_crossentropy',
-                  metrics=['acc',
-                           precision, recall, f1])
+    model.compile(optimizer=optimizers.Adam(lr=0.0005),
+                  loss={"output": focal_loss(3., 0.5)},
+                  # loss_weights={"output": 0.5},
+                  metrics=['acc', precision, recall, f1])
 
-    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=5,
+    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=10,
                             verbose=1, mode='max', cooldown=0, min_lr=0)
 
     model.fit_generator(train_data_loader,
@@ -73,11 +102,6 @@ def train():
                         validation_steps=max(1, len(test_x) // batch_size),
                         epochs=epochs)
 
-    # model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y,
-    #           validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
-    #           epochs=epochs, batch_size=256, shuffle=True,
-    #           callbacks=[checkpoint, rlu])
-
     return model, test_x
 
 

BIN
BiddingKG/dl/table_head/vocab_word.pk


+ 3 - 2
BiddingKG/dl/test/test4.py

@@ -46,7 +46,7 @@ def test(name,content):
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\test12354.txt","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
@@ -75,7 +75,8 @@ if __name__=="__main__":
     # '''
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
     # print(predict("12", content,"打印机"))
-    print(predict("12", text,"打印机"))
+    # content = codecs.open("D:\\Project\\format_conversion_maxcompute\\result.html", "r",encoding="utf8").read()
+    print(predict("12", content,"打印机"))
     # test(12,content)
     # test(12,text)
     print("takes",time.time()-_time1)

Some files were not shown because of the large number of changed files