3 年之前 · 503218e064
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,8 @@
 
															 /BiddingKG/dl/channel/data/
														
 
															 /BiddingKG/dl/test
														
 
															 node_modules
														
 
															+/BiddingKG/dl/table_head/train_data/
														
 
															+/BiddingKG/dl/table_head/check_user_result/
														
 
															+/BiddingKG/dl/table_head/checkpoints/
														
 
															+/BiddingKG/dl/table_head/data_new.csv
														
 
															+/BiddingKG/dl/table_head/has_table_no_attach.xlsx
														
--- a/.idea/compiler.xml
+++ b/.idea/compiler.xml
@@ -1,6 +1,7 @@
 
															 <?xml version="1.0" encoding="UTF-8"?>
														
 
															 <project version="4">
														
 
															   <component name="CompilerConfiguration">
														
 
															+    <option name="BUILD_PROCESS_HEAP_SIZE" value="11000" />
														
 
															     <bytecodeTargetLevel>
														
 
															       <module name="BiddingKG" target="8" />
														
 
															     </bytecodeTargetLevel>
														
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -2,5 +2,13 @@
 
															   <profile version="1.0">
														
 
															     <option name="myName" value="Project Default" />
														
 
															     <inspection_tool class="DuplicatedCode" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
														
 
															+    <inspection_tool class="PyInterpreterInspection" enabled="false" level="WARNING" enabled_by_default="false" />
														
 
															+    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
														
 
															+      <option name="ignoredIdentifiers">
														
 
															+        <list>
														
 
															+          <option value="tensorflow.nn.bidirectional_dynamic_rnn" />
														
 
															+        </list>
														
 
															+      </option>
														
 
															+    </inspection_tool>
														
 
															   </profile>
														
 
															 </component>
														
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
 
															 <?xml version="1.0" encoding="UTF-8"?>
														
 
															 <project version="4">
														
 
															-  <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" project-jdk-type="Python SDK" />
														
 
															+  <component name="ProjectRootManager" version="2" languageLevel="JDK_15" project-jdk-name="Python 3.5 (BiddingKG)" project-jdk-type="Python SDK" />
														
 
															   <component name="PythonCompatibilityInspectionAdvertiser">
														
 
															     <option name="version" value="3" />
														
 
															   </component>
														
--- a/BiddingKG.iml
+++ b/BiddingKG.iml
@@ -2,13 +2,13 @@
 
															 <module type="JAVA_MODULE" version="4">
														
 
															   <component name="FacetManager">
														
 
															     <facet type="Python" name="Python">
														
 
															-      <configuration sdkName="Python 3.5 (dl_nlp)" />
														
 
															+      <configuration sdkName="Python 3.5 (BiddingKG)" />
														
 
															     </facet>
														
 
															   </component>
														
 
															   <component name="NewModuleRootManager">
														
 
															     <content url="file://$MODULE_DIR$" />
														
 
															     <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
														
 
															     <orderEntry type="sourceFolder" forTests="false" />
														
 
															-    <orderEntry type="library" exported="" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
														
 
															+    <orderEntry type="library" name="Python 3.5 (BiddingKG) interpreter library" level="application" />
														
 
															   </component>
														
 
															 </module>
														
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -686,6 +686,35 @@ def embedding_word(datas,shape):
 
															         out_index += 1
														
 
															     return embed
														
 
															+
														
 
															+def embedding_word_forward(datas,shape):
														
 
															+    '''
														
 
															+    @summary:查找词汇对应的词向量
														
 
															+    @param:
														
 
															+        datas:词汇的list
														
 
															+        shape:结果的shape
														
 
															+    @return: array,返回对应shape的词嵌入
														
 
															+    '''
														
 
															+    model_w2v = getModel_word()
														
 
															+    embed = np.zeros(shape)
														
 
															+    length = shape[1]
														
 
															+    out_index = 0
														
 
															+    #print(datas)
														
 
															+    for data in datas:
														
 
															+        index = 0
														
 
															+        for item in str(data)[:shape[1]]:
														
 
															+            if index>=length:
														
 
															+                break
														
 
															+            if item in model_w2v.vocab:
														
 
															+                embed[out_index][index] = model_w2v[item]
														
 
															+                index += 1
														
 
															+            else:
														
 
															+                # embed[out_index][index] = model_w2v['unk']
														
 
															+                index += 1
														
 
															+        out_index += 1
														
 
															+    return embed
														
 
															+
														
 
															+
														
 
															 def formEncoding(text,shape=(100,60),expand=False):
														
 
															     embedding = np.zeros(shape)
														
 
															     word_model = getModel_word()
														
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -8,6 +8,7 @@ import time
 
															 import codecs
														
 
															 from BiddingKG.dl.ratio.re_ratio import extract_ratio
														
 
															+from BiddingKG.dl.table_head.predict import predict
														
 
															 sys.setrecursionlimit(1000000)
														
 
															 sys.path.append(os.path.abspath("../.."))
														
@@ -414,6 +415,28 @@ def tableToText(soup):
 
															         return inner_table,head_list
														
 
															+    def set_head_model(inner_table):
														
 
															+        for i in range(len(inner_table)):
														
 
															+            for j in range(len(inner_table[i])):
														
 
															+                inner_table[i][j] = inner_table[i][j][0]
														
 
															+
														
 
															+        # 模型预测表头
														
 
															+        predict_list = predict(inner_table)
														
 
															+        with open(r"C:\Users\Administrator\Desktop\table_head_test.txt", "a") as f:
														
 
															+            for i in range(len(predict_list)):
														
 
															+                f.write(str(i) + " " + str(inner_table[i]) + "\n")
														
 
															+                f.write(str(i) + " " + str(predict_list[i]) + "\n")
														
 
															+            f.write("\n")
														
 
															+
														
 
															+        # print("table_list", inner_table)
														
 
															+        # print("predict_list", predict_list)
														
 
															+
														
 
															+        for i in range(len(inner_table)):
														
 
															+            for j in range(len(inner_table[i])):
														
 
															+                inner_table[i][j] = [inner_table[i][j], int(predict_list[i][j])]
														
 
															+        head_list = sliceTable(inner_table)
														
 
															+        return inner_table, head_list
														
 
															+
														
 
															     def setHead_incontext(inner_table,pat_head,fix_value="~~",prob_min=0.5):
														
 
															         data_x,data_position = getPredictor("form").getModel("context").encode(inner_table)
														
@@ -969,7 +992,8 @@ def tableToText(soup):
 
															         if len(inner_table)>0 and len(inner_table[0])>0:
														
 
															             #inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
														
 
															             #inner_table,head_list = setHead_inline(inner_table)
														
 
															-            inner_table,head_list = setHead_initem(inner_table,pat_head)
														
 
															+            # inner_table, head_list = setHead_initem(inner_table,pat_head)
														
 
															+            inner_table, head_list = set_head_model(inner_table)
														
 
															             # inner_table,head_list = setHead_incontext(inner_table,pat_head)
														
 
															             # print(inner_table)
														
 
															             # for begin in range(len(head_list[:-1])):
														
--- a/BiddingKG/dl/table_head/checkpoints/best.hdf5
+++ b/BiddingKG/dl/table_head/checkpoints/best.hdf5
--- a/BiddingKG/dl/table_head/check_user_label_accuracy.py
+++ b/BiddingKG/dl/table_head/check_user_label_accuracy.py
@@ -8,15 +8,16 @@ def user_label_accuracy(update_user):
 
															         sql = """
														
 
															         select table_text, pre_label, post_label, id
														
 
															         from label_table_head_info 
														
 
															-        where update_user='""" + update_user + "' order by id desc limit 3000"
														
 
															+        where update_user='""" + update_user + "' order by update_time"
														
 
															     else:
														
 
															         sql = """
														
 
															         select table_text, pre_label, post_label, id
														
 
															         from label_table_head_info 
														
 
															-        where update_user='""" + update_user + "' and status = 1 and update_time >= '2022-01-17'"
														
 
															+        where update_user='""" + update_user + "' and status = 1 and update_time >= '2022-01-23'"
														
 
															     result_list = postgresql_util(sql, limit=1000000)
														
 
															     right_cnt = 0
														
 
															+    error_cnt = 0
														
 
															     error_id_list = []
														
 
															     right_id_list = []
														
 
															     i = 0
														
@@ -24,7 +25,10 @@ def user_label_accuracy(update_user):
 
															     for table in result_list:
														
 
															         i += 1
														
 
															         if i % 1000 == 0:
														
 
															-            print("Loop", i, right_cnt, time.time()-start_time)
														
 
															+            if right_cnt + error_cnt != 0:
														
 
															+                print("Loop", i, right_cnt/(right_cnt+error_cnt), time.time()-start_time)
														
 
															+            else:
														
 
															+                print("Loop", i, time.time()-start_time)
														
 
															             start_time = time.time()
														
 
															         pre_label = eval(table[1])
														
@@ -49,27 +53,44 @@ def user_label_accuracy(update_user):
 
															         else:
														
 
															             label_list = pre_label
														
 
															-        predict_label_list = predict(table_text)
														
 
															+        predict_label_list = predict(table_text, model_id=3)
														
 
															         if predict_label_list:
														
 
															             if str(label_list) == str(predict_label_list):
														
 
															-                right_cnt += 1
														
 
															                 right_id_list.append(str(_id)+"\n")
														
 
															+                # right_cnt += 1
														
 
															             else:
														
 
															-                # cnt = 0
														
 
															-                # for j in range(len(label_list)):
														
 
															-                #     row1 = label_list[j]
														
 
															-                #     row2 = predict_label_list[j]
														
 
															-                #     if str(row1) != str(row2):
														
 
															-                #         cnt += 1
														
 
															-                #     if cnt >= 2:
														
 
															-                #         error_id_list.append(str(_id)+"\n")
														
 
															-                #         break
														
 
															                 error_id_list.append(str(_id)+"\n")
														
 
															+                # error_cnt += 1
														
 
															+            if len(label_list) == len(predict_label_list):
														
 
															+                for j in range(len(label_list)):
														
 
															+                    for k in range(len(label_list[j])):
														
 
															+                        if table_text[j][k] == "":
														
 
															+                            continue
														
 
															+                        if label_list[j][k] == "1" or predict_label_list[j][k] == "1":
														
 
															+                            if len(table_text[j][k]) >= 20:
														
 
															+                                continue
														
 
															+                            else:
														
 
															+                                if label_list[j][k] == predict_label_list[j][k]:
														
 
															+                                    right_cnt += 1
														
 
															+                                else:
														
 
															+                                    error_cnt += 1
														
 
															+            else:
														
 
															+                print("len(label_list) == len(predict_label_list)", _id,
														
 
															+                      len(label_list), len(predict_label_list))
														
 
															-    accuracy = right_cnt / len(result_list)
														
 
															+    accuracy = right_cnt / (right_cnt + error_cnt)
														
 
															     print(update_user + " accuracy:", accuracy, 'total:', len(result_list))
														
 
															     print("error_id_list", len(error_id_list))
														
 
															+    save_path = "check_user_result/accuracy.txt"
														
 
															+    with open(save_path, 'a') as f:
														
 
															+        f.write(update_user + " "
														
 
															+                + "表头正确率-" + str(round(accuracy, 2)) + " "
														
 
															+                + "文章数-" + str(len(result_list)) + " "
														
 
															+                + "表格总数-" + str(right_cnt + error_cnt) + " "
														
 
															+                + "表头正确数-" + str(right_cnt)
														
 
															+                + "\n")
														
 
															+
														
 
															     save_path = "check_user_result/"+update_user+"_error.txt"
														
 
															     with open(save_path, 'w') as f:
														
 
															         f.writelines(error_id_list)
														
@@ -101,9 +122,33 @@ def get_single_result(_id):
 
															 if __name__ == '__main__':
														
 
															-    # users = ["test9", "test11", "test12", "test20", "test25", "test26", "test27"]
														
 
															-    users = ["test20", "test27"]
														
 
															+    # users = ["test9", "test11", "test12", "test25", "test26"]
														
 
															+    # users = ["test9", "test11", ]
														
 
															+    # users = ['test12', 'test25']
														
 
															+    # users = ["test20", "test27"]
														
 
															     # users = ['test']
														
 
															+    users = [
														
 
															+        "test1",
														
 
															+        "test11",
														
 
															+        "test12",
														
 
															+        "test16",
														
 
															+        "test17",
														
 
															+        "test19",
														
 
															+        "test20",
														
 
															+        "test21",
														
 
															+        "test22",
														
 
															+        "test25",
														
 
															+        "test26",
														
 
															+        "test27",
														
 
															+        "test29",
														
 
															+        "test3",
														
 
															+        "test7",
														
 
															+        "test8",
														
 
															+        "test9",
														
 
															+    ]
														
 
															+    users = ["test"]
														
 
															+    users = ["test12", "test17", "test21", "test22", "test27", ]
														
 
															+    users = ["test27"]
														
 
															     acc_list = []
														
 
															     for user in users:
														
 
															         acc = user_label_accuracy(user)
														
--- a/BiddingKG/dl/table_head/checkpoints/binary_loss/best.hdf5
+++ b/BiddingKG/dl/table_head/checkpoints/binary_loss/best.hdf5
--- a/BiddingKG/dl/table_head/checkpoints/focal_loss/best.hdf5
+++ b/BiddingKG/dl/table_head/checkpoints/focal_loss/best.hdf5
--- a/BiddingKG/dl/table_head/loss.py
+++ b/BiddingKG/dl/table_head/loss.py
@@ -15,4 +15,10 @@ def focal_loss(gamma=2., alpha=.5):
 
															                                * K.backend.log(K.backend.epsilon()+pt_1))\
														
 
															                - K.backend.sum((1-alpha) * K.backend.pow(pt_0, gamma)
														
 
															                                * K.backend.log(1. - pt_0 + K.backend.epsilon()))
														
 
															-    return f_loss
														
 
															+    return f_loss
														
 
															+
														
 
															+
														
 
															+def union_loss(gamma=2., alpha=.5):
														
 
															+    def _loss(y_true, y_pred):
														
 
															+
														
 
															+        return focal_loss(gamma, alpha)
														
--- a/BiddingKG/dl/table_head/models/layer_utils.py
+++ b/BiddingKG/dl/table_head/models/layer_utils.py
@@ -0,0 +1,272 @@
 
															+import os
														
 
															+import sys
														
 
															+import tensorflow as tf
														
 
															+from keras.callbacks import Callback
														
 
															+from keras.layers import Layer, warnings
														
 
															+import numpy as np
														
 
															+sys.path.append(os.path.dirname(__file__))
														
 
															+from pre_process import get_best_padding_size
														
 
															+
														
 
															+
														
 
															+class BatchReshape1(Layer):
														
 
															+    """
														
 
															+    将表格的行列维度合并到Batch维度中
														
 
															+    (batch, rows, cols, character_num, character_embed) -> (batch*rows*cols, character_num, character_embed)
														
 
															+    """
														
 
															+
														
 
															+    def __init__(self, character_num, character_embed):
														
 
															+        super(BatchReshape1, self).__init__()
														
 
															+        self.character_num = character_num
														
 
															+        self.character_embed = character_embed
														
 
															+
														
 
															+    def call(self, inputs, mask=None, **kwargs):
														
 
															+        batch = tf.shape(inputs)[0]
														
 
															+        height = tf.shape(inputs)[1]
														
 
															+        width = tf.shape(inputs)[2]
														
 
															+
														
 
															+        outputs = tf.reshape(inputs, (batch*height*width,
														
 
															+                                      self.character_num, self.character_embed))
														
 
															+        return outputs
														
 
															+
														
 
															+    def compute_output_shape(self, input_shape):
														
 
															+        return None, self.character_num, self.character_embed
														
 
															+
														
 
															+
														
 
															+class BatchReshape2(Layer):
														
 
															+    """
														
 
															+    将Batch维度中的行列拆分出来
														
 
															+    (batch*rows*cols, cell_embed) -> (batch, rows, cols, cell_embed)
														
 
															+    """
														
 
															+
														
 
															+    def __init__(self, cell_embed):
														
 
															+        super(BatchReshape2, self).__init__()
														
 
															+        self.cell_embed = cell_embed
														
 
															+
														
 
															+    def call(self, inputs, mask=None, **kwargs):
														
 
															+        input1 = inputs[0]
														
 
															+        input2 = inputs[1]
														
 
															+
														
 
															+        batch = tf.shape(input1)[0]
														
 
															+        height = tf.shape(input1)[1]
														
 
															+        width = tf.shape(input1)[2]
														
 
															+
														
 
															+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
														
 
															+        return outputs
														
 
															+
														
 
															+    def compute_output_shape(self, input_shape):
														
 
															+        return None, None, None, self.cell_embed
														
 
															+
														
 
															+
														
 
															+class BatchReshape3(Layer):
														
 
															+    """
														
 
															+    将表格的行维度合并到Batch维度中
														
 
															+    (batch, rows, cols, cell_embed) -> (batch*rows, cols, cell_embed)
														
 
															+    """
														
 
															+
														
 
															+    def __init__(self, cell_embed):
														
 
															+        super(BatchReshape3, self).__init__()
														
 
															+        self.cell_embed = cell_embed
														
 
															+
														
 
															+    def call(self, inputs, mask=None, **kwargs):
														
 
															+        batch = tf.shape(inputs)[0]
														
 
															+        height = tf.shape(inputs)[1]
														
 
															+        width = tf.shape(inputs)[2]
														
 
															+
														
 
															+        outputs = tf.reshape(inputs, (batch*height, width, self.cell_embed))
														
 
															+        return outputs
														
 
															+
														
 
															+    def compute_output_shape(self, input_shape):
														
 
															+        return None, None, self.cell_embed
														
 
															+
														
 
															+
														
 
															+class BatchReshape4(Layer):
														
 
															+    """
														
 
															+    将Batch维度中的行拆出来
														
 
															+    (batch*rows, cols, cell_embed) -> (batch, rows, cols, cell_embed)
														
 
															+    """
														
 
															+
														
 
															+    def __init__(self, cell_embed):
														
 
															+        super(BatchReshape4, self).__init__()
														
 
															+        self.supports_masking = True
														
 
															+        self.cell_embed = cell_embed
														
 
															+
														
 
															+    def compute_mask(self, inputs, mask=None):
														
 
															+        print(mask)
														
 
															+        # if mask[0] is None:
														
 
															+        #     return mask
														
 
															+
														
 
															+        # input1 = inputs[0]
														
 
															+        # input2 = inputs[1]
														
 
															+        # batch = tf.shape(input1)[0]
														
 
															+        # height = tf.shape(input1)[1]
														
 
															+        # width = tf.shape(input1)[2]
														
 
															+        #
														
 
															+        # mask_tensor = tf.reshape(mask[1], (batch, height, width, self.cell_embed))
														
 
															+        return mask
														
 
															+
														
 
															+    def call(self, inputs, mask=None, **kwargs):
														
 
															+        input1 = inputs[0]
														
 
															+        input2 = inputs[1]
														
 
															+
														
 
															+        batch = tf.shape(input1)[0]
														
 
															+        height = tf.shape(input1)[1]
														
 
															+        width = tf.shape(input1)[2]
														
 
															+
														
 
															+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
														
 
															+        return outputs
														
 
															+
														
 
															+    def compute_output_shape(self, input_shape):
														
 
															+        return None, None, None, self.cell_embed
														
 
															+
														
 
															+
														
 
															+class BatchReshape5(Layer):
														
 
															+    """
														
 
															+    将表格的行维度合并到Batch维度中
														
 
															+    (batch, rows, cols, cell_embed) -> (batch, rows*cols, cell_embed)
														
 
															+    """
														
 
															+
														
 
															+    def __init__(self, cell_embed):
														
 
															+        super(BatchReshape5, self).__init__()
														
 
															+        self.cell_embed = cell_embed
														
 
															+
														
 
															+    def call(self, inputs, mask=None, **kwargs):
														
 
															+        batch = tf.shape(inputs)[0]
														
 
															+        height = tf.shape(inputs)[1]
														
 
															+        width = tf.shape(inputs)[2]
														
 
															+
														
 
															+        outputs = tf.reshape(inputs, (batch, height*width, self.cell_embed))
														
 
															+        return outputs
														
 
															+
														
 
															+    def compute_output_shape(self, input_shape):
														
 
															+        return None, None, self.cell_embed
														
 
															+
														
 
															+
														
 
															+class BatchReshape6(Layer):
														
 
															+    """
														
 
															+    将Batch维度中的行拆出来
														
 
															+    (batch, rows*cols, cell_embed) -> (batch, rows, cols, cell_embed)
														
 
															+    """
														
 
															+
														
 
															+    def __init__(self, cell_embed):
														
 
															+        super(BatchReshape6, self).__init__()
														
 
															+        self.cell_embed = cell_embed
														
 
															+
														
 
															+    def call(self, inputs, mask=None, **kwargs):
														
 
															+        input1 = inputs[0]
														
 
															+        input2 = inputs[1]
														
 
															+
														
 
															+        batch = tf.shape(input1)[0]
														
 
															+        height = tf.shape(input1)[1]
														
 
															+        width = tf.shape(input1)[2]
														
 
															+
														
 
															+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
														
 
															+        return outputs
														
 
															+
														
 
															+    def compute_output_shape(self, input_shape):
														
 
															+        return None, None, None, self.cell_embed
														
 
															+
														
 
															+
														
 
															+class MyPadding(Layer):
														
 
															+    def __init__(self, pad_height, pad_width, cell_embed):
														
 
															+        super(MyPadding, self).__init__()
														
 
															+        self.pad_height = pad_height
														
 
															+        self.pad_width = pad_width
														
 
															+        self.cell_embed = cell_embed
														
 
															+
														
 
															+    def call(self, inputs, mask=None, **kwargs):
														
 
															+        batch = tf.shape(inputs)[0]
														
 
															+        height = tf.shape(inputs)[1]
														
 
															+        width = tf.shape(inputs)[2]
														
 
															+
														
 
															+        outputs = tf.pad(inputs, [[0, 0],
														
 
															+                                  [0, self.pad_height - height],
														
 
															+                                  [0, self.pad_width - width],
														
 
															+                                  [0, 0]])
														
 
															+
														
 
															+        return outputs
														
 
															+
														
 
															+    def compute_output_shape(self, input_shape):
														
 
															+        return None, None, None, self.cell_embed
														
 
															+
														
 
															+
														
 
															+class MySplit(Layer):
														
 
															+    def __init__(self, height, width, **kwargs):
														
 
															+        super(MySplit, self).__init__(**kwargs)
														
 
															+        self.height = height
														
 
															+        self.width = width
														
 
															+
														
 
															+    def call(self, inputs, mask=None, **kwargs):
														
 
															+        outputs = inputs[:, 0:self.height, 0:self.width]
														
 
															+        return outputs
														
 
															+
														
 
															+    def compute_output_shape(self, input_shape):
														
 
															+        return None, None, None
														
 
															+
														
 
															+
														
 
															+class MyModelCheckpoint(Callback):
														
 
															+    def __init__(self, filepath, monitor='val_loss', verbose=0,
														
 
															+                 save_best_only=False, save_weights_only=False,
														
 
															+                 mode='auto', period=1):
														
 
															+        super(MyModelCheckpoint, self).__init__()
														
 
															+        self.monitor = monitor
														
 
															+        self.verbose = verbose
														
 
															+        self.filepath = filepath
														
 
															+        self.save_best_only = save_best_only
														
 
															+        self.save_weights_only = save_weights_only
														
 
															+        self.period = period
														
 
															+        self.epochs_since_last_save = 0
														
 
															+
														
 
															+        if mode not in ['auto', 'min', 'max']:
														
 
															+            warnings.warn('ModelCheckpoint mode %s is unknown, '
														
 
															+                          'fallback to auto mode.' % (mode),
														
 
															+                          RuntimeWarning)
														
 
															+            mode = 'auto'
														
 
															+
														
 
															+        if mode == 'min':
														
 
															+            self.monitor_op = np.less
														
 
															+            self.best = np.Inf
														
 
															+        elif mode == 'max':
														
 
															+            self.monitor_op = np.greater
														
 
															+            self.best = -np.Inf
														
 
															+        else:
														
 
															+            if 'acc' in self.monitor or self.monitor.startswith('fmeasure'):
														
 
															+                self.monitor_op = np.greater
														
 
															+                self.best = -np.Inf
														
 
															+            else:
														
 
															+                self.monitor_op = np.less
														
 
															+                self.best = np.Inf
														
 
															+
														
 
															+    def on_epoch_end(self, epoch, logs=None):
														
 
															+        logs = logs or {}
														
 
															+        self.epochs_since_last_save += 1
														
 
															+        if self.epochs_since_last_save >= self.period:
														
 
															+            self.epochs_since_last_save = 0
														
 
															+            filepath = self.filepath.format(epoch=epoch + 1, **logs)
														
 
															+            if self.save_best_only:
														
 
															+                current = (logs.get(self.monitor[0]) + logs.get(self.monitor[1])) / 2
														
 
															+                if current is None:
														
 
															+                    warnings.warn('Can save best model only with %s available, '
														
 
															+                                  'skipping.' % (self.monitor), RuntimeWarning)
														
 
															+                else:
														
 
															+                    if self.monitor_op(current, self.best):
														
 
															+                        if self.verbose > 0:
														
 
															+                            print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
														
 
															+                                  ' saving model to %s'
														
 
															+                                  % (epoch + 1, self.monitor, self.best,
														
 
															+                                     current, filepath))
														
 
															+                        self.best = current
														
 
															+                        if self.save_weights_only:
														
 
															+                            self.model.save_weights(filepath, overwrite=True)
														
 
															+                        else:
														
 
															+                            self.model.save(filepath, overwrite=True)
														
 
															+                    else:
														
 
															+                        if self.verbose > 0:
														
 
															+                            print('\nEpoch %05d: %s did not improve from %0.5f' %
														
 
															+                                  (epoch + 1, self.monitor, self.best))
														
 
															+            else:
														
 
															+                if self.verbose > 0:
														
 
															+                    print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
														
 
															+                if self.save_weights_only:
														
 
															+                    self.model.save_weights(filepath, overwrite=True)
														
 
															+                else:
														
 
															+                    self.model.save(filepath, overwrite=True)
														
--- a/BiddingKG/dl/table_head/models/loop_lstm.py
+++ b/BiddingKG/dl/table_head/models/loop_lstm.py
@@ -0,0 +1,232 @@
 
															+import keras
														
 
															+import tensorflow as tf
														
 
															+from keras import models, backend as K
														
 
															+from keras.layers import Layer, Input, Lambda, Concatenate, Dense, LSTM, Bidirectional
														
 
															+from tensorflow.contrib.rnn import LSTMCell
														
 
															+import numpy as np
														
 
															+
														
 
															+from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
														
 
															+from BiddingKG.dl.table_head.models.u_net import u_net_small
														
 
															+
														
 
															+
														
 
															+def attention(inputs, w_omega, b_omega, u_omega, time_major=False):
														
 
															+    if isinstance(inputs, tuple):
														
 
															+        inputs = tf.concat(inputs, 2)
														
 
															+    if time_major:  # (B,T,D) => (T,B,D)
														
 
															+        inputs = tf.transpose(inputs, [1, 0, 2])
														
 
															+    v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)
														
 
															+
														
 
															+    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
														
 
															+    alphas = tf.nn.softmax(vu, name='alphas')  # (B,T) shape
														
 
															+    # the result has (B,D) shape
														
 
															+    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
														
 
															+
														
 
															+    return output, alphas
														
 
															+
														
 
															+
														
 
															+class LoopCell(Layer):
														
 
															+    def __init__(self, hidden_size, attention_size, character_num, character_embed,
														
 
															+                 cell_embed):
														
 
															+        super(LoopCell, self).__init__()
														
 
															+
														
 
															+        # Hyper parameters
														
 
															+        self.hidden_size = hidden_size
														
 
															+        self.attention_size = attention_size
														
 
															+        self.character_num = character_num
														
 
															+        self.character_embed = character_embed
														
 
															+        self.cell_embed = cell_embed
														
 
															+
														
 
															+    def build(self, batch_input_shape):
														
 
															+        super(LoopCell, self).build(batch_input_shape)
														
 
															+
														
 
															+        # Trainable parameters
														
 
															+        # Attention
														
 
															+        # self.w_omega = self.add_weight("w_omega", shape=[self.hidden_size*2, self.attention_size],
														
 
															+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
														
 
															+        #                                trainable=True)
														
 
															+        # self.b_omega = self.add_weight("b_omega", shape=[self.attention_size],
														
 
															+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
														
 
															+        #                                trainable=True)
														
 
															+        # self.u_omega = self.add_weight("u_omega", shape=[self.attention_size],
														
 
															+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
														
 
															+        #                                trainable=True)
														
 
															+
														
 
															+        # Bi-LSTM
														
 
															+        # self.forward_cell = LSTMCell(self.hidden_size, forget_bias=1.0, state_is_tuple=True)
														
 
															+        # self.backward_cell = LSTMCell(self.hidden_size, forget_bias=1.0, state_is_tuple=True)
														
 
															+        # self.bi_lism = Bidirectional(LSTM(self.hidden_size, return_sequences=True))
														
 
															+        # self.bi_lism.build(input_shape=(None, self.character_num, self.character_embed))
														
 
															+        # self.trainable_weights += self.bi_lism.trainable_weights
														
 
															+        #
														
 
															+        # self.self_attention = SeqSelfAttention(attention_activation='sigmoid')
														
 
															+        # self.self_attention.build(input_shape=(None, self.character_num, 2*self.hidden_size))
														
 
															+        # self.trainable_weights += self.self_attention.trainable_weights
														
 
															+        # print(self.trainable_weights)
														
 
															+
														
 
															+        # DNN
														
 
															+        # self.w1 = self.add_weight('W1', [2*self.attention_size, self.cell_embed],
														
 
															+        #                           initializer=tf.random_uniform_initializer(-0.25, 0.25),
														
 
															+        #                           trainable=True)
														
 
															+        #
														
 
															+        # self.b1 = self.add_weight('b1', [self.cell_embed],
														
 
															+        #                           initializer=tf.zeros_initializer(),
														
 
															+        #                           trainable=True)
														
 
															+        # self.dense = Dense(self.cell_embed, activation="relu")
														
 
															+        # print(batch_input_shape[0], batch_input_shape[1], batch_input_shape[2])
														
 
															+        # self.dense.build(input_shape=(batch_input_shape[0]*batch_input_shape[1]*batch_input_shape[2],
														
 
															+        #                               2*self.attention_size))
														
 
															+        # self.trainable_weights += self.dense.trainable_weights
														
 
															+
														
 
															+    def call(self, inputs, mask=None, **kwargs):
														
 
															+        def fn(x):
														
 
															+            print("fn_0", x)
														
 
															+
														
 
															+            # (batch*height*width, character_num, hidden_size)
														
 
															+            # outputs, last_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.forward_cell,
														
 
															+            #                                                        cell_bw=self.backward_cell,
														
 
															+            #                                                        inputs=x,
														
 
															+            #                                                        dtype=tf.float32,
														
 
															+            #                                                        time_major=False)
														
 
															+            # (batch*height*width, character_num, 2*hidden_size)
														
 
															+            # outputs = self.bi_lism(x)
														
 
															+            # print("fn_1", outputs)
														
 
															+
														
 
															+            # (batch*height*width, character_num, 2*hidden_size)
														
 
															+            # outputs = self.self_attention(outputs)
														
 
															+            # print("fn_2", outputs)
														
 
															+
														
 
															+            # (batch*height*width, 2*hidden_size)
														
 
															+            # outputs, _ = attention(outputs, self.w_omega, self.b_omega,
														
 
															+            #                        self.u_omega, time_major=False)
														
 
															+
														
 
															+
														
 
															+            # (batch*height*width, cell_embedding)
														
 
															+            # outputs = tf.nn.xw_plus_b(outputs, self.w1, self.b1)
														
 
															+            # outputs = self.dense(outputs)
														
 
															+            # print("fn_3", outputs)
														
 
															+            return outputs
														
 
															+
														
 
															+        batch = tf.shape(inputs)[0]
														
 
															+        height = tf.shape(inputs)[1]
														
 
															+        width = tf.shape(inputs)[2]
														
 
															+
														
 
															+        # (batch, height*width, character_num(time_step), character_embedding)
														
 
															+        # inputs = tf.reshape(inputs, (tf.shape(inputs)[0],
														
 
															+        #                              height*width,
														
 
															+        #                              inputs.shape[3], inputs.shape[4]))
														
 
															+
														
 
															+        # (batch*height*width, character_num, character_embedding)
														
 
															+        outputs = tf.reshape(inputs, (batch*height*width,
														
 
															+                                      inputs.shape[3], inputs.shape[4]))
														
 
															+
														
 
															+        # (height*width, batch, character_num(time_step), character_embedding)
														
 
															+        # inputs = tf.transpose(inputs, (1, 0, 2, 3))
														
 
															+
														
 
															+        # split height*width, each cell
														
 
															+        # (height*width, batch, cell_embedding)
														
 
															+        # outputs = tf.map_fn(fn=lambda x: fn(x), elems=inputs, dtype=tf.float32)
														
 
															+        # print("loop_lstm_1", outputs)
														
 
															+        # outputs = tf.squeeze(outputs, 0)
														
 
															+
														
 
															+        # (batch*height*width, 2*attention_size)
														
 
															+        # outputs = fn(inputs)
														
 
															+        # print("loop_lstm_2", outputs)
														
 
															+
														
 
															+        # (1, batch*height*width, 2*attention_size)
														
 
															+        # outputs = tf.expand_dims(outputs, 0)
														
 
															+        # print("loop_lstm_3", outputs)
														
 
															+
														
 
															+        # (batch*height*width, cell_embedding)
														
 
															+        # outputs = Dense(self.cell_embed, activation="relu")(outputs)
														
 
															+        # print("loop_lstm_3", outputs)
														
 
															+
														
 
															+        # (batch, height*width, cell_embedding)
														
 
															+        # outputs = tf.transpose(outputs, (1, 0, 2))
														
 
															+        # print("loop_lstm_2", outputs)
														
 
															+
														
 
															+        # (batch, height, width, cell_embedding)
														
 
															+        # outputs = tf.reshape(outputs, (batch, height, width, self.cell_embed))
														
 
															+        # print("loop_lstm_4", outputs)
														
 
															+        return outputs
														
 
															+
														
 
															+    def compute_output_shape(self, input_shape):
														
 
															+        return None, self.character_num, self.character_embed
														
 
															+
														
 
															+
														
 
															+class BatchReshape(Layer):
														
 
															+    def __init__(self, cell_embed):
														
 
															+        super(BatchReshape, self).__init__()
														
 
															+        self.cell_embed = cell_embed
														
 
															+
														
 
															+    def call(self, inputs, mask=None, **kwargs):
														
 
															+        input1 = inputs[0]
														
 
															+        input2 = inputs[1]
														
 
															+
														
 
															+        batch = tf.shape(input1)[0]
														
 
															+        height = tf.shape(input1)[1]
														
 
															+        width = tf.shape(input1)[2]
														
 
															+
														
 
															+        # (batch, height, width, cell_embedding)
														
 
															+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
														
 
															+        print("batch_reshape", outputs)
														
 
															+        return outputs
														
 
															+
														
 
															+    def compute_output_shape(self, input_shape):
														
 
															+        return None, None, None, self.cell_embed
														
 
															+
														
 
															+
														
 
															+# def batch_reshape(x):
														
 
															+#     return K.reshape(x, (batch, height, width, cell_embed))
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    input_shape = (16, 8, 10, 60)
														
 
															+    hidden_size = 64
														
 
															+    attention_size = 64
														
 
															+    character_num = 10
														
 
															+    character_embed = 60
														
 
															+    cell_embed = 8
														
 
															+
														
 
															+    # (batch_size, row_num, col_num, character_num, character_embedding)
														
 
															+    X_train = np.random.uniform(0, 1, (10, 16, 8, 10, 60))
														
 
															+    X_test = np.random.uniform(0, 1, (10, 16, 8, 10, 60))
														
 
															+    y_train = np.random.uniform(0, 1, (10, 16, 8))
														
 
															+    y_test = np.random.uniform(0, 1, (10, 16, 8))
														
 
															+
														
 
															+    _input = Input(shape=input_shape, dtype="float32")
														
 
															+    batch = K.shape(_input)[0]
														
 
															+    height = K.shape(_input)[1]
														
 
															+    width = K.shape(_input)[2]
														
 
															+    print(batch, height, width)
														
 
															+
														
 
															+    loop_bi_lstm = LoopCell(hidden_size, attention_size,
														
 
															+                            character_num, character_embed,
														
 
															+                            cell_embed)(_input)
														
 
															+    print("model_2_1", loop_bi_lstm)
														
 
															+    dense = Dense(cell_embed, activation="relu")(loop_bi_lstm)
														
 
															+    print("model_2_2", dense)
														
 
															+    reshape = Lambda(batch_reshape, output_shape=(height, width, cell_embed))(dense)
														
 
															+    print("model_2_3", reshape)
														
 
															+    u_net = u_net_small(loop_bi_lstm)
														
 
															+    merge = Concatenate(axis=-1)([loop_bi_lstm, u_net])
														
 
															+    dense = Dense(LoopCell().cell_embed, activation='relu')(merge)
														
 
															+    dense = Dense(1, activation='sigmoid')(dense)
														
 
															+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
														
 
															+    model = models.Model(inputs=_input, outputs=squeeze)
														
 
															+    model.summary(line_length=120)
														
 
															+    model.compile(loss='binary_crossentropy', optimizer='adam')
														
 
															+    model.fit(X_train, y_train,
														
 
															+              epochs=2,
														
 
															+              batch_size=1,
														
 
															+              validation_data=(X_test, y_test))
														
 
															+
														
 
															+    # (batch_size, row_num, col_num, character_num, character_embedding)
														
 
															+    X_train = np.random.uniform(0, 1, (5, 32, 24, 10, 60))
														
 
															+    X_test = np.random.uniform(0, 1, (5, 32, 24, 10, 60))
														
 
															+    y_train = np.random.uniform(0, 1, (5, 32, 24))
														
 
															+    y_test = np.random.uniform(0, 1, (5, 32, 24))
														
 
															+
														
 
															+    model.fit(X_train, y_train,
														
 
															+              epochs=2,
														
 
															+              batch_size=1,
														
 
															+              validation_data=(X_test, y_test))
														
--- a/BiddingKG/dl/table_head/models/model.py
+++ b/BiddingKG/dl/table_head/models/model.py
@@ -1,16 +1,21 @@
 
															 import sys
														
 
															 import os
														
 
															+import numpy as np
														
 
															+from keras.layers import Lambda, Dense, Reshape, Bidirectional, LSTM, Conv2D, BatchNormalization, LeakyReLU, Masking
														
 
															+from keras_preprocessing.sequence import pad_sequences
														
 
															+sys.path.append(os.path.dirname(__file__))
														
 
															-from keras.layers import Lambda
														
 
															-
														
 
															-sys.path.append(os.path.abspath("../.."))
														
 
															-from keras import layers, models
														
 
															+from models.layer_utils import BatchReshape1, BatchReshape2, MyPadding, MySplit, BatchReshape3, \
														
 
															+    BatchReshape4, BatchReshape5, BatchReshape6
														
 
															+from keras import layers, models, Sequential
														
 
															 import keras.backend as K
														
 
															-from BiddingKG.dl.table_head.models.my_average_pooling import MyAveragePooling1D
														
 
															-from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
														
 
															+import tensorflow as tf
														
 
															+from models.my_average_pooling import MyAveragePooling1D
														
 
															+from models.self_attention import SeqSelfAttention, MySelfAttention
														
 
															+from models.u_net import u_net_small
														
 
															-def get_model(input_shape, output_shape):
														
 
															+def model_1(input_shape, output_shape):
														
 
															     # Input (batch, 10, 60)
														
 
															     input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
														
 
															     input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
														
@@ -67,3 +72,228 @@ def get_model(input_shape, output_shape):
 
															     model.summary()
														
 
															     return model
														
 
															+
														
 
															+
														
 
															+def model_2(input_shape, output_shape):
														
 
															+    # input_shape = (None, None, 10, 60)
														
 
															+    # (batch_size, row_num, col_num, character_num, character_embedding)
														
 
															+    hidden_size = 64
														
 
															+    attention_size = 64
														
 
															+    character_num = 10
														
 
															+    character_embed = 60
														
 
															+    cell_embed = 1
														
 
															+
														
 
															+    # Input
														
 
															+    input_1 = layers.Input(shape=input_shape, dtype="float32", name="input_1")
														
 
															+    input_2 = layers.Input(shape=(None, None, None, None), dtype="int32", name="input_2")
														
 
															+    # batch = tf.shape(_input)[0]
														
 
															+    height = tf.shape(input_2)[1]
														
 
															+    width = tf.shape(input_2)[2]
														
 
															+    pad_height = tf.shape(input_2)[3]
														
 
															+    pad_width = tf.shape(input_2)[4]
														
 
															+
														
 
															+    # print("batch, height, width", batch, height, width)
														
 
															+
														
 
															+    # Reshape
														
 
															+    reshape = BatchReshape1(character_num, character_embed)(input_1)
														
 
															+    print("model_2_0", reshape)
														
 
															+
														
 
															+    # Bi-LSTM + Attention
														
 
															+    bi_lstm = Bidirectional(LSTM(hidden_size))(reshape)
														
 
															+    print("model_2_1", bi_lstm)
														
 
															+    # bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape)
														
 
															+    # self_attention = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm)
														
 
															+    # trans = Lambda(lambda x: tf.transpose(x, (0, 2, 1)))(self_attention)
														
 
															+    # dense = Dense(1, activation='relu')(trans)
														
 
															+    # squeeze = Lambda(lambda x: tf.squeeze(x, -1))(dense)
														
 
															+
														
 
															+    dense = Dense(1, activation="sigmoid")(bi_lstm)
														
 
															+    print("model_2_2", dense)
														
 
															+    # reshape = Lambda(batch_reshape, output_shape=(height, width, cell_embed))(dense)
														
 
															+    reshape = BatchReshape2(cell_embed)([input_1, dense])
														
 
															+    print("model_2_3", reshape)
														
 
															+    # squeeze_1 = Lambda(lambda x: K.squeeze(x, axis=-1), name="output_1")(reshape)
														
 
															+    # print("model_2_4", squeeze)
														
 
															+
														
 
															+    # Padding
														
 
															+    padding = MyPadding(pad_height, pad_width, cell_embed)(reshape)
														
 
															+    # padding = reshape
														
 
															+    print("model_2_4", padding)
														
 
															+
														
 
															+    # U-Net
														
 
															+    # u_net = u_net_small(padding)
														
 
															+    # print("model_2_5", u_net)
														
 
															+
														
 
															+    # Conv 5*5
														
 
															+    conv = Conv2D(1, (5, 5), padding='same')(padding)
														
 
															+    bn = BatchNormalization()(conv)
														
 
															+    relu = LeakyReLU(alpha=0.)(bn)
														
 
															+
														
 
															+    conv = Conv2D(1, (5, 5), padding='same')(relu)
														
 
															+    bn = BatchNormalization()(conv)
														
 
															+    relu = LeakyReLU(alpha=0.)(bn)
														
 
															+
														
 
															+    conv = Conv2D(1, (5, 5), padding='same')(relu)
														
 
															+    bn = BatchNormalization()(conv)
														
 
															+    relu_1 = LeakyReLU(alpha=0.)(bn)
														
 
															+
														
 
															+    # Conv 3*3
														
 
															+    conv = Conv2D(1, (3, 3), padding='same')(padding)
														
 
															+    bn = BatchNormalization()(conv)
														
 
															+    relu = LeakyReLU(alpha=0.)(bn)
														
 
															+
														
 
															+    conv = Conv2D(1, (3, 3), padding='same')(relu)
														
 
															+    bn = BatchNormalization()(conv)
														
 
															+    relu = LeakyReLU(alpha=0.)(bn)
														
 
															+
														
 
															+    conv = Conv2D(1, (3, 3), padding='same')(relu)
														
 
															+    bn = BatchNormalization()(conv)
														
 
															+    relu_2 = LeakyReLU(alpha=0.)(bn)
														
 
															+
														
 
															+    # Conv 1*1
														
 
															+    conv = Conv2D(1, (1, 1), padding='same')(padding)
														
 
															+    bn = BatchNormalization()(conv)
														
 
															+    relu = LeakyReLU(alpha=0.)(bn)
														
 
															+
														
 
															+    conv = Conv2D(1, (1, 1), padding='same')(relu)
														
 
															+    bn = BatchNormalization()(conv)
														
 
															+    relu = LeakyReLU(alpha=0.)(bn)
														
 
															+
														
 
															+    conv = Conv2D(1, (1, 1), padding='same')(relu)
														
 
															+    bn = BatchNormalization()(conv)
														
 
															+    relu_3 = LeakyReLU(alpha=0.)(bn)
														
 
															+
														
 
															+    # conv = Conv2D(cell_embed, (3, 3), padding='same')(relu)
														
 
															+    # bn = BatchNormalization()(conv)
														
 
															+    # relu_2 = LeakyReLU(alpha=0.)(bn)
														
 
															+
														
 
															+    # Merge
														
 
															+    # print("model_2_5", relu_1, relu_2)
														
 
															+    merge = layers.Concatenate(axis=-1)([relu_1, relu_2, relu_3])
														
 
															+    # merge = u_net
														
 
															+    # merge = relu
														
 
															+    dense = layers.Dense(1, activation='sigmoid')(merge)
														
 
															+    squeeze_2 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
														
 
															+
														
 
															+    # Split
														
 
															+    split = MySplit(height, width, name="output")(squeeze_2)
														
 
															+
														
 
															+    model = models.Model(inputs=[input_1, input_2], outputs=split)
														
 
															+    model.summary(line_length=120)
														
 
															+    return model
														
 
															+
														
 
															+
														
 
															+def model_3(input_shape, output_shape):
														
 
															+    # (batch_size, row_num, col_num, character_num, character_embedding)
														
 
															+
														
 
															+    hidden_size = 16
														
 
															+    attention_size = 2*hidden_size
														
 
															+    character_num = 20
														
 
															+    character_embed = 60
														
 
															+    cell_embed = 2*hidden_size
														
 
															+    pad_len = 100
														
 
															+    mask_timestamps = pad_len
														
 
															+
														
 
															+    # Input
														
 
															+    input_1 = layers.Input(shape=input_shape, dtype="float32", name="input_1")
														
 
															+    input_2 = layers.Input(shape=(None, None, None, None), dtype="int32", name="input_2")
														
 
															+
														
 
															+    # Reshape
														
 
															+    reshape = BatchReshape1(character_num, character_embed)(input_1)
														
 
															+    print("model_2_0", reshape)
														
 
															+
														
 
															+    # Bi-LSTM
														
 
															+    bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape)
														
 
															+    bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=False))(bi_lstm)
														
 
															+    print("model_2_1", bi_lstm)
														
 
															+
														
 
															+    # Reshape
														
 
															+    reshape = BatchReshape2(cell_embed)([input_1, bi_lstm])
														
 
															+    print("model_2_3", reshape)
														
 
															+
														
 
															+    # Rows Reshape
														
 
															+    reshape_1 = BatchReshape3(cell_embed)(reshape)
														
 
															+
														
 
															+    # Cols Reshape
														
 
															+    trans = Lambda(lambda x: tf.transpose(x, (0, 2, 1, 3)))(reshape)
														
 
															+    reshape_2 = BatchReshape3(cell_embed)(trans)
														
 
															+
														
 
															+    # All boxes Reshape
														
 
															+    reshape_3 = BatchReshape5(cell_embed)(reshape)
														
 
															+
														
 
															+    # Masking
														
 
															+    # mask_1 = Masking(mask_value=-1, input_shape=(mask_timestamps, cell_embed))(pad_1)
														
 
															+    # mask_2 = Masking(mask_value=-1, input_shape=(mask_timestamps, cell_embed))(pad_2)
														
 
															+    # print("model_2_4", mask_1)
														
 
															+
														
 
															+    # Padding
														
 
															+    # pad_1 = MyPadding()
														
 
															+
														
 
															+    # Bi-LSTM
														
 
															+    # bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))
														
 
															+    # bi_lstm_1 = bi_lstm(reshape_1)
														
 
															+    # bi_lstm_2 = bi_lstm(reshape_2)
														
 
															+    bi_lstm_1 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_1)
														
 
															+    bi_lstm_2 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_2)
														
 
															+    # bi_lstm_1 = LSTM(2*hidden_size, return_sequences=True)(reshape_1)
														
 
															+    # print("model_2_4", bi_lstm_1)
														
 
															+    # bi_lstm_2 = LSTM(2*hidden_size, return_sequences=True)(reshape_2)
														
 
															+    # self_attention_1 = MySelfAttention(output_dim=attention_size)(bi_lstm_1)
														
 
															+    # self_attention_2 = MySelfAttention(output_dim=attention_size)(bi_lstm_2)
														
 
															+
														
 
															+    # Bi-LSTM + Attention
														
 
															+    bi_lstm_3 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_3)
														
 
															+    # bi_lstm_3 = LSTM(2*hidden_size, return_sequences=True)(reshape_3)
														
 
															+    # self_attention_3 = MySelfAttention(output_dim=attention_size)(bi_lstm_3)
														
 
															+    # print("model_2_5", bi_lstm_1)
														
 
															+
														
 
															+    # Reshape
														
 
															+    reshape_1 = BatchReshape4(cell_embed)([reshape, bi_lstm_1])
														
 
															+    reshape_2 = BatchReshape4(cell_embed)([trans, bi_lstm_2])
														
 
															+    reshape_2 = Lambda(lambda x: tf.transpose(x, (0, 2, 1, 3)))(reshape_2)
														
 
															+    reshape_3 = BatchReshape6(cell_embed)([reshape, bi_lstm_3])
														
 
															+    print("model_2_6", reshape_1)
														
 
															+
														
 
															+    # Merge
														
 
															+    merge = layers.Concatenate(axis=-1)([reshape, reshape_1, reshape_2, reshape_3])
														
 
															+    dense = layers.Dense(hidden_size, activation='relu')(merge)
														
 
															+    dense = layers.Dense(1, activation='sigmoid')(dense)
														
 
															+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1), name="output")(dense)
														
 
															+
														
 
															+    model = models.Model(inputs=[input_1, input_2], outputs=squeeze)
														
 
															+    model.summary(line_length=110)
														
 
															+    return model
														
 
															+
														
 
															+
														
 
															+def get_model(input_shape, output_shape, model_id):
														
 
															+    if model_id == 1:
														
 
															+        return model_1(input_shape, output_shape)
														
 
															+    elif model_id == 2:
														
 
															+        return model_2(input_shape, output_shape)
														
 
															+    elif model_id == 3:
														
 
															+        return model_3(input_shape, output_shape)
														
 
															+    else:
														
 
															+        print("No such model!")
														
 
															+        raise Exception()
														
 
															+
														
 
															+
														
 
															+def test_layer():
														
 
															+    model = Sequential()
														
 
															+    model.add(Masking(mask_value=-1, input_shape=(5, 8)))
														
 
															+    model.add(Lambda(lambda x: pad_sequences(x, maxlen=100, dtype='float32',
														
 
															+                                             padding='post', truncating='post',
														
 
															+                                             value=-1)))
														
 
															+    model.add(Masking(mask_value=-1, input_shape=(5, 8)))
														
 
															+    model.add(LSTM(32, return_sequences=True))
														
 
															+
														
 
															+    model.compile(optimizer='sgd', loss='mse')
														
 
															+
														
 
															+    x = np.zeros([1, 5, 8])
														
 
															+    print(x.shape)
														
 
															+    y = np.zeros([1, 5, 32])
														
 
															+    model.summary()
														
 
															+    model.fit(x, y, batch_size=32, epochs=10)
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    test_layer()
														
--- a/BiddingKG/dl/table_head/models/model_2.py
+++ b/BiddingKG/dl/table_head/models/model_2.py
@@ -0,0 +1,58 @@
 
															+import sys
														
 
															+import os
														
 
															+sys.path.append(os.path.abspath("../.."))
														
 
															+from keras import layers, models
														
 
															+import tensorflow as tf
														
 
															+from BiddingKG.dl.table_head.models.my_average_pooling import MyAveragePooling1D
														
 
															+from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
														
 
															+
														
 
															+
														
 
															+def get_model(input_shape, output_shape):
														
 
															+    # Input
														
 
															+    input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
														
 
															+    input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
														
 
															+    input_3 = layers.Input(shape=input_shape[1:], dtype="float32")
														
 
															+    input_4 = layers.Input(shape=input_shape[1:], dtype="float32")
														
 
															+    input_5 = layers.Input(shape=input_shape[1:], dtype="float32")
														
 
															+    input_6 = layers.Input(shape=input_shape[1:], dtype="float32")
														
 
															+
														
 
															+    # Bi-LSTM
														
 
															+    bi_lstm_1 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_1)
														
 
															+    bi_lstm_2 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_2)
														
 
															+    bi_lstm_3 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_3)
														
 
															+    bi_lstm_4 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_4)
														
 
															+    bi_lstm_5 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_5)
														
 
															+    bi_lstm_6 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_6)
														
 
															+
														
 
															+    # Self-Attention
														
 
															+    self_attention_1 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_1)
														
 
															+    self_attention_2 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_2)
														
 
															+    self_attention_3 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_3)
														
 
															+    self_attention_4 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_4)
														
 
															+    self_attention_5 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_5)
														
 
															+    self_attention_6 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_6)
														
 
															+
														
 
															+    # Concat
														
 
															+    concat_1 = layers.concatenate([self_attention_1, self_attention_2, self_attention_3])
														
 
															+    concat_2 = layers.concatenate([self_attention_4, self_attention_5, self_attention_6])
														
 
															+
														
 
															+    # Dense + Sigmoid
														
 
															+    dense_1 = layers.Dense(output_shape[0], activation="sigmoid")(concat_1)
														
 
															+    dense_2 = layers.Dense(output_shape[0], activation="sigmoid")(concat_2)
														
 
															+
														
 
															+    # mask mean pooling
														
 
															+    pool_1 = MyAveragePooling1D(axis=1)(dense_1)
														
 
															+    pool_2 = MyAveragePooling1D(axis=1)(dense_2)
														
 
															+
														
 
															+    # Concat
														
 
															+    concat = layers.concatenate([pool_1, pool_2])
														
 
															+
														
 
															+    # Dense
														
 
															+    output = layers.Dense(10)(concat)
														
 
															+    output = layers.Dense(1, activation="sigmoid", name='output')(output)
														
 
															+
														
 
															+    model = models.Model(inputs=[input_1, input_2, input_3, input_4, input_5, input_6],
														
 
															+                         outputs=output)
														
 
															+
														
 
															+    model.summary()
														
 
															+    return model
														
--- a/BiddingKG/dl/table_head/models/self_attention.py
+++ b/BiddingKG/dl/table_head/models/self_attention.py
@@ -1,5 +1,6 @@
 
															 import keras
														
 
															 from keras import backend as K
														
 
															+from keras.layers import Layer
														
 
															 class SeqSelfAttention(keras.layers.Layer):
														
@@ -237,4 +238,42 @@ class SeqSelfAttention(keras.layers.Layer):
 
															     @staticmethod
														
 
															     def get_custom_objects():
														
 
															-        return {'SeqSelfAttention': SeqSelfAttention}
														
 
															+        return {'SeqSelfAttention': SeqSelfAttention}
														
 
															+
														
 
															+
														
 
															+class MySelfAttention(Layer):
														
 
															+    def __init__(self, output_dim, **kwargs):
														
 
															+        self.output_dim = output_dim
														
 
															+        super(MySelfAttention, self).__init__(**kwargs)
														
 
															+
														
 
															+    def build(self, input_shape):
														
 
															+        # inputs.shape = (batch_size, time_steps, seq_len)
														
 
															+        self.W_Q = self.add_weight(name='W_Q',
														
 
															+                                   shape=(input_shape[2], self.output_dim),
														
 
															+                                   initializer='uniform',
														
 
															+                                   trainable=True)
														
 
															+        self.W_K = self.add_weight(name='W_K',
														
 
															+                                   shape=(input_shape[2], self.output_dim),
														
 
															+                                   initializer='uniform',
														
 
															+                                   trainable=True)
														
 
															+        self.W_V = self.add_weight(name='W_V',
														
 
															+                                   shape=(input_shape[2], self.output_dim),
														
 
															+                                   initializer='uniform',
														
 
															+                                   trainable=True)
														
 
															+
														
 
															+        super(MySelfAttention, self).build(input_shape)
														
 
															+
														
 
															+    def call(self, x, mask=None, **kwargs):
														
 
															+        _Q = K.dot(x, self.W_Q)
														
 
															+        _K = K.dot(x, self.W_K)
														
 
															+        _V = K.dot(x, self.W_V)
														
 
															+
														
 
															+        # batch_dot代替K.T
														
 
															+        _Z = K.batch_dot(_Q, K.permute_dimensions(_K, [0, 2, 1]))
														
 
															+        _Z = _Z / (self.output_dim**0.5)
														
 
															+        _Z = K.softmax(_Z)
														
 
															+        _Z = K.batch_dot(_Z, _V)
														
 
															+        return _Z
														
 
															+
														
 
															+    def compute_output_shape(self, input_shape):
														
 
															+        return input_shape[0], input_shape[1], self.output_dim
														
--- a/BiddingKG/dl/table_head/models/tf_bi_lstm.py
+++ b/BiddingKG/dl/table_head/models/tf_bi_lstm.py
@@ -0,0 +1,49 @@
 
															+import tensorflow as tf
														
 
															+from tensorflow.contrib.rnn import LSTMCell
														
 
															+from tensorflow.contrib.rnn import MultiRNNCell
														
 
															+
														
 
															+
														
 
															+class LstmBase:
														
 
															+    """
														
 
															+    build rnn cell
														
 
															+    """
														
 
															+    def build_rnn(self, hidden_size, num_layes):
														
 
															+        cells = []
														
 
															+        for i in range(num_layes):
														
 
															+            cell = LSTMCell(num_units=hidden_size,
														
 
															+                            state_is_tuple=True,
														
 
															+                            initializer=tf.random_uniform_initializer(-0.25, 0.25))
														
 
															+            cells.append(cell)
														
 
															+        cells = MultiRNNCell(cells, state_is_tuple=True)
														
 
															+
														
 
															+        return cells
														
 
															+
														
 
															+
														
 
															+class BiLstm(LstmBase):
														
 
															+    """
														
 
															+    define the lstm
														
 
															+    """
														
 
															+    def __init__(self, scope_name, hidden_size, num_layers):
														
 
															+        super(BiLstm, self).__init__()
														
 
															+        assert hidden_size % 2 == 0
														
 
															+        hidden_size /= 2
														
 
															+
														
 
															+        self.fw_rnns = []
														
 
															+        self.bw_rnns = []
														
 
															+        for i in range(num_layers):
														
 
															+            self.fw_rnns.append(self.build_rnn(hidden_size, 1))
														
 
															+            self.bw_rnns.append(self.build_rnn(hidden_size, 1))
														
 
															+
														
 
															+        self.scope_name = scope_name
														
 
															+
														
 
															+    def __call__(self, input, input_len):
														
 
															+        for idx, (fw_rnn, bw_rnn) in enumerate(zip(self.fw_rnns, self.bw_rnns)):
														
 
															+            scope_name = '{}_{}'.format(self.scope_name, idx)
														
 
															+            ctx, _ = tf.nn.bidirectional_dynamic_rnn(
														
 
															+                fw_rnn, bw_rnn, input, sequence_length=input_len,
														
 
															+                dtype=tf.float32, time_major=False,
														
 
															+                scope=scope_name
														
 
															+            )
														
 
															+            input = tf.concat(ctx, -1)
														
 
															+        ctx = input
														
 
															+        return ctx
														
--- a/BiddingKG/dl/table_head/models/u_net.py
+++ b/BiddingKG/dl/table_head/models/u_net.py
@@ -0,0 +1,82 @@
 
															+from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, BatchNormalization, UpSampling2D
														
 
															+from keras.layers import LeakyReLU
														
 
															+
														
 
															+
														
 
															+def u_net_small(inputs, num_classes=1):
														
 
															+    # 8
														
 
															+    use_bias = False
														
 
															+    down0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(inputs)
														
 
															+    down0 = BatchNormalization()(down0)
														
 
															+    down0 = LeakyReLU(alpha=0.)(down0)
														
 
															+    down0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(down0)
														
 
															+    down0 = BatchNormalization()(down0)
														
 
															+    down0 = LeakyReLU(alpha=0.)(down0)
														
 
															+    down0_pool = MaxPooling2D((2, 2), strides=(2, 2))(down0)
														
 
															+
														
 
															+    # 4
														
 
															+    down1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(down0_pool)
														
 
															+    down1 = BatchNormalization()(down1)
														
 
															+    down1 = LeakyReLU(alpha=0.)(down1)
														
 
															+    down1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(down1)
														
 
															+    down1 = BatchNormalization()(down1)
														
 
															+    down1 = LeakyReLU(alpha=0.)(down1)
														
 
															+    down1_pool = MaxPooling2D((2, 2), strides=(2, 2))(down1)
														
 
															+
														
 
															+    # 2
														
 
															+    down2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(down1_pool)
														
 
															+    down2 = BatchNormalization()(down2)
														
 
															+    down2 = LeakyReLU(alpha=0.)(down2)
														
 
															+    down2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(down2)
														
 
															+    down2 = BatchNormalization()(down2)
														
 
															+    down2 = LeakyReLU(alpha=0.)(down2)
														
 
															+    down2_pool = MaxPooling2D((2, 2), strides=(2, 2))(down2)
														
 
															+
														
 
															+    center = Conv2D(64, (3, 3), padding='same', use_bias=use_bias)(down2_pool)
														
 
															+    center = BatchNormalization()(center)
														
 
															+    center = LeakyReLU(alpha=0.)(center)
														
 
															+    center = Conv2D(64, (3, 3), padding='same', use_bias=use_bias)(center)
														
 
															+    center = BatchNormalization()(center)
														
 
															+    center = LeakyReLU(alpha=0.)(center)
														
 
															+
														
 
															+    # 2
														
 
															+    up2 = UpSampling2D((2, 2))(center)
														
 
															+    up2 = concatenate([down2, up2], axis=3)
														
 
															+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
														
 
															+    up2 = BatchNormalization()(up2)
														
 
															+    up2 = LeakyReLU(alpha=0.)(up2)
														
 
															+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
														
 
															+    up2 = BatchNormalization()(up2)
														
 
															+    up2 = LeakyReLU(alpha=0.)(up2)
														
 
															+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
														
 
															+    up2 = BatchNormalization()(up2)
														
 
															+    up2 = LeakyReLU(alpha=0.)(up2)
														
 
															+
														
 
															+    # 4
														
 
															+    up1 = UpSampling2D((2, 2))(up2)
														
 
															+    up1 = concatenate([down1, up1], axis=3)
														
 
															+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
														
 
															+    up1 = BatchNormalization()(up1)
														
 
															+    up1 = LeakyReLU(alpha=0.)(up1)
														
 
															+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
														
 
															+    up1 = BatchNormalization()(up1)
														
 
															+    up1 = LeakyReLU(alpha=0.)(up1)
														
 
															+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
														
 
															+    up1 = BatchNormalization()(up1)
														
 
															+    up1 = LeakyReLU(alpha=0.)(up1)
														
 
															+
														
 
															+    # 8
														
 
															+    up0 = UpSampling2D((2, 2))(up1)
														
 
															+    up0 = concatenate([down0, up0], axis=3)
														
 
															+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
														
 
															+    up0 = BatchNormalization()(up0)
														
 
															+    up0 = LeakyReLU(alpha=0.)(up0)
														
 
															+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
														
 
															+    up0 = BatchNormalization()(up0)
														
 
															+    up0 = LeakyReLU(alpha=0.)(up0)
														
 
															+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
														
 
															+    up0 = BatchNormalization()(up0)
														
 
															+    up0 = LeakyReLU(alpha=0.)(up0)
														
 
															+
														
 
															+    # classify
														
 
															+    # classify = Conv2D(num_classes, (1, 1), activation='sigmoid')(up0)
														
 
															+    return up0
														
--- a/BiddingKG/dl/table_head/post_process.py
+++ b/BiddingKG/dl/table_head/post_process.py
@@ -24,3 +24,21 @@ def table_post_process(table_text_list, predict_result, threshold=0.5):
 
															         print("table_post_process 输出label维度与text不一致!")
														
 
															         table_label_list = []
														
 
															     return table_label_list
														
 
															+
														
 
															+
														
 
															+def table_post_process_2(table_text_list, predict_result, threshold=0.5):
														
 
															+    predict_result = predict_result.tolist()[0]
														
 
															+    predict_list = []
														
 
															+    for row in predict_result:
														
 
															+        new_row = []
														
 
															+        for col in row:
														
 
															+            if col >= threshold:
														
 
															+                new_row.append("1")
														
 
															+            else:
														
 
															+                new_row.append("0")
														
 
															+        predict_list.append(new_row)
														
 
															+
														
 
															+    if len(predict_list) != len(predict_result):
														
 
															+        print("table_post_process 输出label维度与text不一致!")
														
 
															+        predict_list = []
														
 
															+    return predict_list
														
--- a/BiddingKG/dl/table_head/postgresql2csv.py
+++ b/BiddingKG/dl/table_head/postgresql2csv.py
@@ -21,23 +21,29 @@ def eval_text_list(table_text):
 
															 def read_postgresql(txt_name, start_id, _time):
														
 
															     conn = psycopg2.connect(database="table_head_label", user="postgres",
														
 
															                             password="postgres", host="192.168.2.103", port="5432")
														
 
															-
														
 
															-    with open('check_user_result/' + txt_name, "r") as f:
														
 
															-        id_list = f.readlines()
														
 
															-    # with open('check_user_result/test27.txt', "r") as f:
														
 
															-    #     id_list += f.readlines()
														
 
															-
														
 
															-    _list = []
														
 
															-    for _id in id_list:
														
 
															-        _id = _id[:-1]
														
 
															-        sql = 'select * from label_table_head_info where id =' + _id
														
 
															+    row_list = []
														
 
															+    if txt_name == "":
														
 
															+        sql = """
														
 
															+        select * from "label_table_head_info" 
														
 
															+        where status = 1 and update_time >= '2022-01-17';
														
 
															+        """
														
 
															         df = pd.read_sql(sql=sql, con=conn)
														
 
															-        # df = df[0]
														
 
															         for index, row in df.iterrows():
														
 
															-            _list.append([x for x in row])
														
 
															-    cnt = 0
														
 
															+            row_list.append([x for x in row])
														
 
															+    else:
														
 
															+        with open('check_user_result/' + txt_name, "r") as f:
														
 
															+            id_list = f.readlines()
														
 
															+        for _id in id_list:
														
 
															+            _id = _id[:-1]
														
 
															+            sql = 'select * from label_table_head_info where id =' + _id
														
 
															+            df = pd.read_sql(sql=sql, con=conn)
														
 
															+            # df = df[0]
														
 
															+            for index, row in df.iterrows():
														
 
															+                row_list.append([x for x in row])
														
 
															+        cnt = 0
														
 
															+
														
 
															     new_list = []
														
 
															-    for line in _list:
														
 
															+    for line in row_list:
														
 
															         try:
														
 
															             table_text = eval_text_list(line[2])
														
 
															         except:
														
@@ -57,7 +63,7 @@ def read_postgresql(txt_name, start_id, _time):
 
															         label_list = predict(table_text)
														
 
															         line[3] = str(label_list)
														
 
															         new_list.append(line)
														
 
															-    df = pd.DataFrame(_list)
														
 
															+    df = pd.DataFrame(new_list)
														
 
															     new_csv_path = "data_new.csv"
														
 
															     df.to_csv(new_csv_path, index=False)
														
@@ -66,7 +72,7 @@ def read_postgresql(txt_name, start_id, _time):
 
															 if __name__ == '__main__':
														
 
															-    new_csv_path = read_postgresql('test20_error.txt', 203995, '2022-01-01 00:00:00')
														
 
															+    new_csv_path = read_postgresql('test11_error.txt', 206863, '2021-12-31 00:00:00')
														
 
															     # new_csv_path = read_postgresql('test20_right.txt', 203995, '')
														
 
															     # df = pd.read_csv('data_new.csv')
														
 
															     # print(df.iloc[:, 4])
														
--- a/BiddingKG/dl/table_head/pre_process.py
+++ b/BiddingKG/dl/table_head/pre_process.py
@@ -1,8 +1,10 @@
 
															+import os
														
 
															 import random
														
 
															-
														
 
															+import sys
														
 
															 import psycopg2
														
 
															 import numpy as np
														
 
															-from BiddingKG.dl.common.Utils import embedding_word
														
 
															+sys.path.append(os.path.dirname(__file__) + "/../")
														
 
															+from common.Utils import embedding_word, embedding_word_forward
														
 
															 def get_sentence_index_list(sentence, dict_path='utils/ppocr_keys_v1.txt'):
														
@@ -42,22 +44,28 @@ def postgresql_util(sql, limit):
 
															     return all_rows
														
 
															-def get_data_from_sql(dim=10):
														
 
															+def get_data_from_sql(dim=10, whole_table=False, padding=True):
														
 
															+    sql = """
														
 
															+    select table_text, pre_label, post_label, id
														
 
															+    from label_table_head_info
														
 
															+    where status = 0 and (update_user='test9' or update_user='test1' or update_user='test7' or update_user='test26')
														
 
															+    ;
														
 
															+    """
														
 
															     # sql = """
														
 
															     # select table_text, pre_label, post_label, id
														
 
															     # from label_table_head_info
														
 
															-    # where update_user <> 'test27' and update_user <> 'test20' and table_box_cnt >= 4 and table_box_cnt <= 200
														
 
															+    # where status = 1 and update_time >= '2022-01-17' and update_time <= '2022-01-22'
														
 
															     # ;
														
 
															     # """
														
 
															-    sql = """
														
 
															-    select table_text, pre_label, post_label, id
														
 
															-    from label_table_head_info 
														
 
															-    where status = 1 and update_time >= '2022-01-17'
														
 
															-    ;
														
 
															-    """
														
 
															     result_list = postgresql_util(sql, limit=1000000)
														
 
															+    # 需排除的id
														
 
															+    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
														
 
															+        delete_id_list = eval(f.read())
														
 
															+    with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "r") as f:
														
 
															+        delete_id_list += eval(f.read())
														
 
															+
														
 
															     all_data_list = []
														
 
															     all_data_label_list = []
														
 
															     i = 0
														
@@ -71,6 +79,10 @@ def get_data_from_sql(dim=10):
 
															         post_label = eval(table[2])
														
 
															         _id = table[3]
														
 
															+        if _id in delete_id_list:
														
 
															+            print("pass", _id)
														
 
															+            continue
														
 
															+
														
 
															         # table_text需要特殊处理
														
 
															         try:
														
 
															             table_text = table[0]
														
@@ -84,17 +96,35 @@ def get_data_from_sql(dim=10):
 
															             print("无法识别table_text", _id)
														
 
															             continue
														
 
															-        # 只有一行的也不要
														
 
															-        if len(post_label) >= 2:
														
 
															-            data_list, data_label_list = table_pre_process(table_text, post_label, _id)
														
 
															-        elif len(pre_label) >= 2:
														
 
															-            data_list, data_label_list = table_pre_process(table_text, pre_label, _id)
														
 
															+        if whole_table:
														
 
															+            if len(post_label) >= 2:
														
 
															+                data_list, data_label_list = table_pre_process_2(table_text, post_label,
														
 
															+                                                                 _id, padding=padding)
														
 
															+            elif len(pre_label) >= 2:
														
 
															+                data_list, data_label_list = table_pre_process_2(table_text, pre_label,
														
 
															+                                                                 _id, padding=padding)
														
 
															+            else:
														
 
															+                data_list, data_label_list = [], []
														
 
															         else:
														
 
															-            data_list, data_label_list = [], []
														
 
															+            # 只有一行的也不要
														
 
															+            if len(post_label) >= 2:
														
 
															+                data_list, data_label_list = table_pre_process(table_text, post_label, _id)
														
 
															+            elif len(pre_label) >= 2:
														
 
															+                data_list, data_label_list = table_pre_process(table_text, pre_label, _id)
														
 
															+            else:
														
 
															+                data_list, data_label_list = [], []
														
 
															         all_data_list += data_list
														
 
															         all_data_label_list += data_label_list
														
 
															+    # 按维度大小排序
														
 
															+    if whole_table:
														
 
															+        _list = []
														
 
															+        for data, label in zip(all_data_list, all_data_label_list):
														
 
															+            _list.append([data, label])
														
 
															+        _list.sort(key=lambda x: (len(x[0]), len(x[0][0])))
														
 
															+        all_data_list[:], all_data_label_list[:] = zip(*_list)
														
 
															+
														
 
															     print("len(all_data_list)", len(all_data_list))
														
 
															     return all_data_list, all_data_label_list
														
@@ -206,7 +236,84 @@ def table_pre_process(text_list, label_list, _id, is_train=True):
 
															         return data_list
														
 
															-def get_data_from_file(file_type):
														
 
															+def table_pre_process_2(text_list, label_list, _id, is_train=True, padding=True):
														
 
															+    """
														
 
															+    表格处理，整个表格为一个数组，且填充长宽维度
														
 
															+
														
 
															+    :param text_list:
														
 
															+    :param label_list:
														
 
															+    :param _id:
														
 
															+    :param is_train:
														
 
															+    :return:
														
 
															+    """
														
 
															+    # 判断表格长宽是否合理
														
 
															+    row_len = len(text_list)
														
 
															+    best_row_len = get_best_padding_size(row_len, min_len=8)
														
 
															+    col_len = len(text_list[0])
														
 
															+    best_col_len = get_best_padding_size(col_len, min_len=8)
														
 
															+    if best_row_len is None:
														
 
															+        if is_train:
														
 
															+            return [], []
														
 
															+        else:
														
 
															+            return []
														
 
															+    if best_col_len is None:
														
 
															+        if is_train:
														
 
															+            return [], []
														
 
															+        else:
														
 
															+            return []
														
 
															+
														
 
															+    if is_train:
														
 
															+        if len(text_list) != len(label_list):
														
 
															+            print("文字单元格与标注单元格数量不匹配！", _id)
														
 
															+            print("len(text_list)", len(text_list), "len(label_list)", len(label_list))
														
 
															+            return [], []
														
 
															+
														
 
															+        if padding:
														
 
															+            for i in range(row_len):
														
 
															+                col_len = len(text_list[i])
														
 
															+                text_list[i] += [None]*(best_col_len-col_len)
														
 
															+                if is_train:
														
 
															+                    label_list[i] += ["0"]*(best_col_len-col_len)
														
 
															+            text_list += [[None]*best_col_len]*(best_row_len-row_len)
														
 
															+            if is_train:
														
 
															+                label_list += [["0"]*best_col_len]*(best_row_len-row_len)
														
 
															+
														
 
															+    if is_train:
														
 
															+        for i in range(len(label_list)):
														
 
															+            for j in range(len(label_list[i])):
														
 
															+                label_list[i][j] = int(label_list[i][j])
														
 
															+        return [text_list], [label_list]
														
 
															+    else:
														
 
															+        return [text_list]
														
 
															+
														
 
															+
														
 
															+def get_best_padding_size(axis_len, min_len=3, max_len=300):
														
 
															+    # sizes = [8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
														
 
															+    #          128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224,
														
 
															+    #          232, 240, 248, 256, 264, 272, 280, 288, 296]
														
 
															+    # sizes = [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57,
														
 
															+    #          60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99, 102, 105, 108, 111,
														
 
															+    #          114, 117, 120, 123, 126, 129, 132, 135, 138, 141, 144, 147, 150, 153, 156,
														
 
															+    #          159, 162, 165, 168, 171, 174, 177, 180, 183, 186, 189, 192, 195, 198, 201,
														
 
															+    #          204, 207, 210, 213, 216, 219, 222, 225, 228, 231, 234, 237, 240, 243, 246,
														
 
															+    #          249, 252, 255, 258, 261, 264, 267, 270, 273, 276, 279, 282, 285, 288, 291,
														
 
															+    #          294, 297]
														
 
															+    sizes = []
														
 
															+    for i in range(1, max_len):
														
 
															+        if i * min_len <= max_len:
														
 
															+            sizes.append(i * min_len)
														
 
															+    if axis_len > sizes[-1]:
														
 
															+        return axis_len
														
 
															+    best_len = sizes[-1]
														
 
															+    for height in sizes:
														
 
															+        if axis_len <= height:
														
 
															+            best_len = height
														
 
															+            break
														
 
															+    # print("get_best_padding_size", axis_len, best_len)
														
 
															+    return best_len
														
 
															+
														
 
															+
														
 
															+def get_data_from_file(file_type, model_id=1):
														
 
															     if file_type == 'np':
														
 
															         data_path = 'train_data/data_3.npy'
														
 
															         data_label_path = 'train_data/data_label_3.npy'
														
@@ -215,17 +322,20 @@ def get_data_from_file(file_type):
 
															         array2 = np.load(data_label_path)
														
 
															         return array1, array2
														
 
															     elif file_type == 'txt':
														
 
															-        data_path = 'train_data/data3.txt'
														
 
															-        data_label_path = 'train_data/data_label3.txt'
														
 
															-
														
 
															+        if model_id == 1:
														
 
															+            data_path = 'train_data/data1.txt'
														
 
															+            data_label_path = 'train_data/data_label1.txt'
														
 
															+        elif model_id == 2:
														
 
															+            data_path = 'train_data/data2.txt'
														
 
															+            data_label_path = 'train_data/data_label2.txt'
														
 
															+        elif model_id == 3:
														
 
															+            data_path = 'train_data/data3.txt'
														
 
															+            data_label_path = 'train_data/data_label3.txt'
														
 
															         with open(data_path, 'r') as f:
														
 
															             data_list = f.readlines()
														
 
															         with open(data_label_path, 'r') as f:
														
 
															             data_label_list = f.readlines()
														
 
															-        # for i in range(len(data_list)):
														
 
															-        #     data_list[i] = eval(data_list[i][:-1])
														
 
															-        #     data_label_list[i] = eval(data_label_list[i][:-1])
														
 
															         return data_list, data_label_list
														
 
															     else:
														
 
															         print("file type error! only np and txt supported")
														
@@ -245,18 +355,19 @@ def processed_save_to_np():
 
															     #         f.write(str(line) + "\n")
														
 
															-def processed_save_to_txt():
														
 
															-    list1, list2 = get_data_from_sql()
														
 
															+def processed_save_to_txt(whole_table=False, padding=True):
														
 
															+    list1, list2 = get_data_from_sql(whole_table=whole_table, padding=padding)
														
 
															     # 打乱
														
 
															+    # if not whole_table or not padding:
														
 
															     zip_list = list(zip(list1, list2))
														
 
															     random.shuffle(zip_list)
														
 
															     list1[:], list2[:] = zip(*zip_list)
														
 
															-    with open('train_data/data3.txt', 'w') as f:
														
 
															+    with open('train_data/data1.txt', 'w') as f:
														
 
															         for line in list1:
														
 
															             f.write(str(line) + "\n")
														
 
															-    with open('train_data/data_label3.txt', 'w') as f:
														
 
															+    with open('train_data/data_label1.txt', 'w') as f:
														
 
															         for line in list2:
														
 
															             f.write(str(line) + "\n")
														
@@ -287,7 +398,7 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
 
															     data_num = len(data_list)
														
 
															     # 定义Embedding输出
														
 
															-    output_shape = (6, 10, 60)
														
 
															+    output_shape = (6, 20, 60)
														
 
															     # batch循环取数据
														
 
															     i = 0
														
@@ -349,8 +460,109 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
 
															                    'input_4': X[3], 'input_5': X[4], 'input_6': X[5], }
														
 
															+def my_data_loader_2(table_list, table_label_list, batch_size, is_train=True):
														
 
															+    pad_len = 0
														
 
															+
														
 
															+    table_num = len(table_list)
														
 
															+    if is_train and batch_size == 1:
														
 
															+        table_list, table_label_list = get_random(table_list, table_label_list)
														
 
															+
														
 
															+    # Embedding shape
														
 
															+    output_shape = (20, 60)
														
 
															+
														
 
															+    # batch循环取数据
														
 
															+    i = 0
														
 
															+    last_shape = None
														
 
															+    while True:
														
 
															+        new_table_list = []
														
 
															+        new_table_label_list = []
														
 
															+        for j in range(batch_size):
														
 
															+            if i >= table_num:
														
 
															+                i = 0
														
 
															+                if is_train:
														
 
															+                    table_list, table_label_list = get_random(table_list, table_label_list,
														
 
															+                                                              seed=random.randint(1, 40))
														
 
															+
														
 
															+            if type(table_list[i]) != list:
														
 
															+                table = eval(table_list[i][:-1])
														
 
															+            else:
														
 
															+                table = table_list[i]
														
 
															+
														
 
															+            if batch_size > 1:
														
 
															+                if last_shape is None:
														
 
															+                    last_shape = (len(table), len(table[0]))
														
 
															+                    continue
														
 
															+                if (len(table), len(table[0])) != last_shape:
														
 
															+                    last_shape = (len(table), len(table[0]))
														
 
															+                    break
														
 
															+
														
 
															+            if is_train:
														
 
															+                table_label = eval(table_label_list[i][:-1])
														
 
															+
														
 
															+            # 中文字符映射为Embedding
														
 
															+            for k in range(len(table)):
														
 
															+                table[k] = embedding_word_forward(table[k], (len(table[k]),
														
 
															+                                                     output_shape[0],
														
 
															+                                                     output_shape[1]))
														
 
															+            new_table_list.append(table)
														
 
															+            if is_train:
														
 
															+                new_table_label_list.append(table_label)
														
 
															+            i += 1
														
 
															+        new_table_list = np.array(new_table_list)
														
 
															+        X = new_table_list
														
 
															+        if X.shape[-2:] != output_shape:
														
 
															+            # print("Dimension not match!", X.shape)
														
 
															+            # print("\n")
														
 
															+            continue
														
 
															+
														
 
															+        # 获取Padding大小
														
 
															+        pad_height = get_best_padding_size(X.shape[1], pad_len)
														
 
															+        pad_width = get_best_padding_size(X.shape[2], pad_len)
														
 
															+        input_2 = np.zeros([1, X.shape[1], X.shape[2], pad_height, pad_width])
														
 
															+
														
 
															+        if is_train:
														
 
															+            new_table_label_list = np.array(new_table_label_list)
														
 
															+            Y = new_table_label_list
														
 
															+            # Y = Y.astype(np.float32)
														
 
															+            # yield {"input_1": X, "input_2": input_2}, \
														
 
															+            #       {"output_1": Y, "output_2": Y}
														
 
															+            yield {"input_1": X, "input_2": input_2}, \
														
 
															+                  {"output": Y}
														
 
															+        else:
														
 
															+            yield {"input_1": X, "input_2": input_2}
														
 
															+
														
 
															+
														
 
															+def check_train_data():
														
 
															+    data_list, label_list = get_data_from_file('txt', model_id=2)
														
 
															+    for data in data_list:
														
 
															+        data = eval(data)
														
 
															+        if len(data) % 8 != 0:
														
 
															+            print(len(data))
														
 
															+            print(len(data[0]))
														
 
															+        for row in data:
														
 
															+            if len(row) % 8 != 0:
														
 
															+                print(len(data))
														
 
															+                print(len(row))
														
 
															+
														
 
															+
														
 
															+def get_random(text_list, label_list, seed=42):
														
 
															+    random.seed(seed)
														
 
															+    zip_list = list(zip(text_list, label_list))
														
 
															+    random.shuffle(zip_list)
														
 
															+    text_list[:], label_list[:] = zip(*zip_list)
														
 
															+    return text_list, label_list
														
 
															+
														
 
															+
														
 
															 if __name__ == '__main__':
														
 
															-    processed_save_to_txt()
														
 
															+    processed_save_to_txt(whole_table=False, padding=False)
														
 
															     # data_balance()
														
 
															     # test_embedding()
														
 
															+    # check_train_data()
														
 
															+
														
 
															+    # _list = []
														
 
															+    # for i in range(1, 100):
														
 
															+    #     _list.append(i*3)
														
 
															+    # print(_list)
														
 
															+
														
 
															+    # print(get_best_padding_size(9, 5))
														
--- a/BiddingKG/dl/table_head/predict.py
+++ b/BiddingKG/dl/table_head/predict.py
--- a/BiddingKG/dl/table_head/preprocessing_test.py
+++ b/BiddingKG/dl/table_head/preprocessing_test.py
@@ -0,0 +1,17 @@
 
															+import codecs
														
 
															+import pandas as pd
														
 
															+from bs4 import BeautifulSoup
														
 
															+from BiddingKG.dl.interface.extract import predict
														
 
															+
														
 
															+
														
 
															+def test():
														
 
															+    df = pd.read_excel("has_table_no_attach.xlsx")
														
 
															+    for index, row in df.iterrows():
														
 
															+        if index % 100 == 0:
														
 
															+            print("Loop", index)
														
 
															+        text = row['dochtmlcon']
														
 
															+        predict(str(index), text)
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    test()
														
--- a/BiddingKG/dl/table_head/table_simplify.py
+++ b/BiddingKG/dl/table_head/table_simplify.py
@@ -0,0 +1,188 @@
 
															+#coding:utf-8
														
 
															+import json
														
 
															+import logging
														
 
															+
														
 
															+from BiddingKG.dl.table_head.pre_process import postgresql_util
														
 
															+
														
 
															+
														
 
# Score per annotator, keyed by the `update_user` value of a labeled table.
# remove_duplicate() compares these scores when two tables are judged equal and
# marks the table of the lower-scored user for deletion; users missing from
# this dict are treated as 0.
# NOTE(review): presumably a measured labeling accuracy per user - confirm.
user_score = {
    "test": 1.,
    "test1": 0.83,
    "test11": 0.82,
    "test12": 0.74,
    "test16": 0.83,
    "test17": 0.77,
    "test19": 0.79,
    "test20": 0.82,
    "test21": 0.73,
    "test22": 0.64,
    "test25": 0.77,
    "test26": 0.80,
    "test27": 0.72,
    "test29": 0.8,
    "test3": 0.,
    "test7": 0.82,
    "test8": 0.78,
    "test9": 0.80,
}
														
 
															+
														
 
															+
														
 
															+def get_labeled_table():
														
 
															+    sql = """
														
 
															+    select id, update_user, table_text, pre_label, post_label
														
 
															+    from label_table_head_info where status = 0
														
 
															+    """
														
 
															+
														
 
															+    result_list = postgresql_util(sql, limit=1000000)
														
 
															+    print("len(result_list)", len(result_list))
														
 
															+    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
														
 
															+        not_eval_table_list = f.read()
														
 
															+    not_eval_table_list = eval(not_eval_table_list)
														
 
															+
														
 
															+    table_list = []
														
 
															+    # not_eval_table_list = []
														
 
															+    for table in result_list:
														
 
															+        pre_label = eval(table[3])
														
 
															+        post_label = eval(table[4])
														
 
															+        _id = table[0]
														
 
															+        update_user = table[1]
														
 
															+        table_text = table[2]
														
 
															+        if _id in not_eval_table_list:
														
 
															+            continue
														
 
															+
														
 
															+        try:
														
 
															+            if table_text[0] == '"':
														
 
															+                table_text = eval(table_text)
														
 
															+            else:
														
 
															+                table_text = table_text
														
 
															+            table_text = table_text.replace('\\', '/')
														
 
															+            table_text = eval(table_text)
														
 
															+        except:
														
 
															+            print("无法识别table_text", _id)
														
 
															+            not_eval_table_list.append(_id)
														
 
															+            continue
														
 
															+
														
 
															+        if post_label:
														
 
															+            label_list = post_label
														
 
															+        else:
														
 
															+            label_list = pre_label
														
 
															+
														
 
															+        table_list.append([table_text, label_list, update_user, _id])
														
 
															+    print("len(table_list)", len(table_list))
														
 
															+    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "w") as f:
														
 
															+    #     f.write(str(not_eval_table_list))
														
 
															+    return table_list
														
 
															+
														
 
															+
														
 
															+def table_distance(table1, table2, thresh=0.85):
														
 
															+    # flatten
														
 
															+    table1 = [col for row in table1 for col in row]
														
 
															+    table2 = [col for row in table2 for col in row]
														
 
															+    while "" in table1:
														
 
															+        table1.remove("")
														
 
															+    while "" in table2:
														
 
															+        table2.remove("")
														
 
															+
														
 
															+    equal_cnt = 0
														
 
															+    not_equal_cnt = 0
														
 
															+    equal_flag = 0
														
 
															+    for col1 in table1:
														
 
															+        find_flag = 0
														
 
															+        for col2 in table2:
														
 
															+            if col1 == col2:
														
 
															+                equal_cnt += 1
														
 
															+                find_flag = 1
														
 
															+                break
														
 
															+        if not find_flag:
														
 
															+            not_equal_cnt += 1
														
 
															+        # print(equal_cnt, not_equal_cnt)
														
 
															+        if round(equal_cnt / max(len(table1), len(table2)), 2) >= thresh:
														
 
															+            # print("> thresh")
														
 
															+            equal_flag = 1
														
 
															+            break
														
 
															+        if round(not_equal_cnt / max(len(table1), len(table2)), 2) >= 1-thresh:
														
 
															+            # print("> 1-thresh")
														
 
															+            equal_flag = 0
														
 
															+            break
														
 
															+    return equal_flag
														
 
															+
														
 
															+
														
 
															+def remove_duplicate(table_list):
														
 
															+    logging.info("into remove_duplicate")
														
 
															+    table_list.sort(key=lambda x: x[0])
														
 
															+    delete_table_id_list = []
														
 
															+    for i in range(len(table_list)):
														
 
															+        delete_table_id_list = list(set(delete_table_id_list))
														
 
															+        if i % 1000 == 0:
														
 
															+            print("Loop", i, "len(delete_table_id_list)", len(delete_table_id_list))
														
 
															+            logging.info("*")
														
 
															+            with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "w") as f:
														
 
															+                f.write(str(delete_table_id_list))
														
 
															+        table1 = table_list[i]
														
 
															+        if len(table1[0]) <= 2 and len(table1[0][0]) <= 2:
														
 
															+            delete_table_id_list.append(table1[3])
														
 
															+            continue
														
 
															+        for j in range(i+1, len(table_list)):
														
 
															+            table2 = table_list[j]
														
 
															+            if len(table2[0]) <= 2 and len(table2[0][0]) <= 2:
														
 
															+                delete_table_id_list.append(table2[3])
														
 
															+                continue
														
 
															+            # 行数相差2以上忽略
														
 
															+            if abs(len(table1[0]) - len(table2[0])) >= 2:
														
 
															+                continue
														
 
															+            # 列数相差2以上忽略
														
 
															+            if abs(len(table1[0][0])) - len(table2[0][0]) >= 2:
														
 
															+                continue
														
 
															+            if table_distance(table1[0], table2[0]):
														
 
															+                print("equal", table1[3], table2[3])
														
 
															+                score1 = user_score.get(table1[2])
														
 
															+                score2 = user_score.get(table2[2])
														
 
															+                if score1 is None:
														
 
															+                    score1 = 0.
														
 
															+                if score2 is None:
														
 
															+                    score2 = 0.
														
 
															+                if score1 >= score2:
														
 
															+                    delete_table_id_list.append(table2[3])
														
 
															+                else:
														
 
															+                    delete_table_id_list.append(table1[3])
														
 
															+
														
 
															+    delete_table_id_list = list(set(delete_table_id_list))
														
 
															+    new_table_list = []
														
 
															+    for table in table_list:
														
 
															+        if table[3] not in delete_table_id_list:
														
 
															+            new_table_list.append(table)
														
 
															+    return new_table_list
														
 
															+
														
 
															+
														
 
															+def eval_table(_str):
														
 
															+    try:
														
 
															+        if _str[0] == '"':
														
 
															+            table_text = eval(_str)
														
 
															+        else:
														
 
															+            table_text = _str
														
 
															+        table_text = table_text.replace('\\', '/')
														
 
															+        table_text = eval(table_text)
														
 
															+    except:
														
 
															+        print("无法识别table_text")
														
 
															+        table_text = ""
														
 
															+    return table_text
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    _list = get_labeled_table()
														
 
															+    _list = remove_duplicate(_list)
														
 
															+    _str = json.dumps(str(_list))
														
 
															+    with open(r"C:\Users\Administrator\Desktop\table_simplify.txt", "w") as f:
														
 
															+        f.write(_str)
														
 
															+
														
 
															+    # _str1 = "[['', '', 'Yes']]"
														
 
															+    # _str2 = "[['', '', 'Yes', '']]"
														
 
															+    # table1 = eval_table(_str1)
														
 
															+    # table2 = eval_table(_str2)
														
 
															+    #
														
 
															+    # print(table_distance(table1, table2))
														
 
															+
														
 
															+    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
														
 
															+    #     not_eval_table_list = f.read()
														
 
															+    # print(not_eval_table_list)
														
 
															+    # not_eval_table_list = eval(not_eval_table_list)
														
--- a/BiddingKG/dl/table_head/train.py
+++ b/BiddingKG/dl/table_head/train.py
@@ -2,24 +2,40 @@ import sys
 
															 import os
														
 
															 sys.path.append(os.path.abspath("../../.."))
														
 
															 os.environ['KERAS_BACKEND'] = 'tensorflow'
														
 
															-from keras.metrics import categorical_accuracy
														
 
															+from BiddingKG.dl.table_head.models.layer_utils import MyModelCheckpoint
														
 
															 from BiddingKG.dl.table_head.metrics import precision, recall, f1
														
 
															 from keras import optimizers, Model
														
 
															 from BiddingKG.dl.table_head.models.model import get_model
														
 
															 from BiddingKG.dl.table_head.loss import focal_loss
														
 
															 from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
														
 
															-from BiddingKG.dl.table_head.pre_process import get_data_from_file, get_data_from_sql, my_data_loader
														
 
															+from BiddingKG.dl.table_head.pre_process import get_data_from_file, get_data_from_sql, my_data_loader, my_data_loader_2, \
														
 
															+    get_random
														
 
															 from keras import backend as K
														
 
															-
														
 
															-input_shape = (6, 10, 60)
														
 
															-output_shape = (1,)
														
 
															-batch_size = 32
														
 
															-epochs = 1000
														
 
															-pretrained_path = "checkpoints/best.hdf5"
														
 
															-checkpoint_path = "checkpoints/"
														
 
															-PRETRAINED = True
														
 
															-CHECKPOINT = False
														
 
															+model_id = 1
														
 
															+
														
 
															+if model_id == 1:
														
 
															+    input_shape = (6, 20, 60)
														
 
															+    output_shape = (1,)
														
 
															+    batch_size = 128
														
 
															+    epochs = 1000
														
 
															+    PRETRAINED = True
														
 
															+    CHECKPOINT = False
														
 
															+    # 用GPU
														
 
															+    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
														
 
															+else:
														
 
															+    input_shape = (None, None, 20, 60)
														
 
															+    output_shape = (None, None)
														
 
															+    batch_size = 1
														
 
															+    epochs = 1000
														
 
															+    PRETRAINED = False
														
 
															+    CHECKPOINT = False
														
 
															+    # 用CPU
														
 
															+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
														
 
															+    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
														
 
															+
														
 
															+pretrained_path = "checkpoints/" + str(model_id) + "/best.hdf5"
														
 
															+checkpoint_path = "checkpoints/" + str(model_id) + "/"
														
 
															 def train():
														
@@ -27,22 +43,31 @@ def train():
 
															     print("gpus", K.tensorflow_backend._get_available_gpus())
														
 
															     # Data
														
 
															-    data_x, data_y = get_data_from_file('txt')
														
 
															-    # data_x = data_x[:60000]
														
 
															-    # data_y = data_y[:60000]
														
 
															+    data_x, data_y = get_data_from_file('txt', model_id=model_id)
														
 
															     print("finish read data", len(data_x))
														
 
															     # Split -> Train, Test
														
 
															-    split_size = int(len(data_x)*0.1)
														
 
															-    test_x, test_y = data_x[:split_size], data_y[:split_size]
														
 
															-    train_x, train_y = data_x[split_size:], data_y[split_size:]
														
 
															+    if model_id == 1:
														
 
															+        split_size = int(len(data_x)*0.1)
														
 
															+        test_x, test_y = data_x[:split_size], data_y[:split_size]
														
 
															+        train_x, train_y = data_x[split_size:], data_y[split_size:]
														
 
															+    else:
														
 
															+        data_x, data_y = get_random(data_x, data_y)
														
 
															+        split_size = int(len(data_x)*0.1)
														
 
															+        test_x, test_y = data_x[:split_size], data_y[:split_size]
														
 
															+        train_x, train_y = data_x[split_size:], data_y[split_size:]
														
 
															+    print("len(train_x), len(test_x)", len(train_x), len(test_x))
														
 
															     # Data Loader
														
 
															-    train_data_loader = my_data_loader(train_x, train_y, batch_size=batch_size)
														
 
															-    test_data_loader = my_data_loader(test_x, test_y, batch_size=batch_size)
														
 
															+    if model_id == 1:
														
 
															+        train_data_loader = my_data_loader(train_x, train_y, batch_size=batch_size)
														
 
															+        test_data_loader = my_data_loader(test_x, test_y, batch_size=batch_size)
														
 
															+    else:
														
 
															+        train_data_loader = my_data_loader_2(train_x, train_y, batch_size=batch_size)
														
 
															+        test_data_loader = my_data_loader_2(test_x, test_y, batch_size=1)
														
 
															     # Model
														
 
															-    model = get_model(input_shape, output_shape)
														
 
															+    model = get_model(input_shape, output_shape, model_id=model_id)
														
 
															     if PRETRAINED:
														
 
															         model.load_weights(pretrained_path)
														
 
															         print("read pretrained model", pretrained_path)
														
@@ -54,16 +79,20 @@ def train():
 
															     else:
														
 
															         print("no checkpoint")
														
 
															-    filepath = 'e{epoch:02d}-f1{val_f1:.2f}'
														
 
															-    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5", monitor='val_f1',
														
 
															-                                 verbose=1, save_best_only=True, mode='max')
														
 
															+    filepath = 'e-{epoch:02d}_f1-{val_f1:.2f}'
														
 
															+    # filepath = 'e-{epoch:02d}_acc-{val_loss:.2f}'
														
 
															+    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5",
														
 
															+                                 monitor='val_f1',
														
 
															+                                 verbose=1,
														
 
															+                                 save_best_only=True,
														
 
															+                                 mode='max')
														
 
															-    model.compile(optimizer=optimizers.Adam(lr=0.005), loss=focal_loss(),
														
 
															-    # model.compile(optimizer=optimizers.Adam(lr=0.005), loss='binary_crossentropy',
														
 
															-                  metrics=['acc',
														
 
															-                           precision, recall, f1])
														
 
															+    model.compile(optimizer=optimizers.Adam(lr=0.0005),
														
 
															+                  loss={"output": focal_loss(3., 0.5)},
														
 
															+                  # loss_weights={"output": 0.5},
														
 
															+                  metrics=['acc', precision, recall, f1])
														
 
															-    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=5,
														
 
															+    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=10,
														
 
															                             verbose=1, mode='max', cooldown=0, min_lr=0)
														
 
															     model.fit_generator(train_data_loader,
														
@@ -73,11 +102,6 @@ def train():
 
															                         validation_steps=max(1, len(test_x) // batch_size),
														
 
															                         epochs=epochs)
														
 
															-    # model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y,
														
 
															-    #           validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
														
 
															-    #           epochs=epochs, batch_size=256, shuffle=True,
														
 
															-    #           callbacks=[checkpoint, rlu])
														
 
															-
														
 
															     return model, test_x
														
--- a/BiddingKG/dl/table_head/vocab_word.pk
+++ b/BiddingKG/dl/table_head/vocab_word.pk
--- a/BiddingKG/dl/test/test4.py
+++ b/BiddingKG/dl/test/test4.py
@@ -46,7 +46,7 @@ def test(name,content):
 
															 if __name__=="__main__":
														
 
															     # filename = "比地_52_79929693.html"
														
 
															     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
														
 
															-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\test12354.txt","r",encoding="utf8").read()
														
 
															+    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
														
 
															     content = str(BeautifulSoup(text).find("div",id="pcontent"))
														
 
															     # df_a = {"html":[]}
														
 
															     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
														
@@ -75,7 +75,8 @@ if __name__=="__main__":
 
															     # '''
														
 
															     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
														
 
															     # print(predict("12", content,"打印机"))
														
 
															-    print(predict("12", text,"打印机"))
														
 
															+    # content = codecs.open("D:\\Project\\format_conversion_maxcompute\\result.html", "r",encoding="utf8").read()
														
 
															+    print(predict("12", content,"打印机"))
														
 
															     # test(12,content)
														
 
															     # test(12,text)
														
 
															     print("takes",time.time()-_time1)