Browse source

Merge remote-tracking branch 'origin/master'

znj 3 years ago
parent
commit
503218e064

+ 5 - 0
.gitignore

@@ -10,3 +10,8 @@
 /BiddingKG/dl/channel/data/
 /BiddingKG/dl/test
 node_modules
+/BiddingKG/dl/table_head/train_data/
+/BiddingKG/dl/table_head/check_user_result/
+/BiddingKG/dl/table_head/checkpoints/
+/BiddingKG/dl/table_head/data_new.csv
+/BiddingKG/dl/table_head/has_table_no_attach.xlsx

+ 1 - 0
.idea/compiler.xml

@@ -1,6 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="CompilerConfiguration">
+    <option name="BUILD_PROCESS_HEAP_SIZE" value="11000" />
     <bytecodeTargetLevel>
       <module name="BiddingKG" target="8" />
     </bytecodeTargetLevel>

+ 8 - 0
.idea/inspectionProfiles/Project_Default.xml

@@ -2,5 +2,13 @@
   <profile version="1.0">
     <option name="myName" value="Project Default" />
     <inspection_tool class="DuplicatedCode" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyInterpreterInspection" enabled="false" level="WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredIdentifiers">
+        <list>
+          <option value="tensorflow.nn.bidirectional_dynamic_rnn" />
+        </list>
+      </option>
+    </inspection_tool>
   </profile>
 </component>

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_15" project-jdk-name="Python 3.5 (BiddingKG)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>

+ 2 - 2
BiddingKG.iml

@@ -2,13 +2,13 @@
 <module type="JAVA_MODULE" version="4">
   <component name="FacetManager">
     <facet type="Python" name="Python">
-      <configuration sdkName="Python 3.5 (dl_nlp)" />
+      <configuration sdkName="Python 3.5 (BiddingKG)" />
     </facet>
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
     <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
-    <orderEntry type="library" exported="" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
+    <orderEntry type="library" name="Python 3.5 (BiddingKG) interpreter library" level="application" />
   </component>
 </module>

+ 29 - 0
BiddingKG/dl/common/Utils.py

@@ -686,6 +686,35 @@ def embedding_word(datas,shape):
         out_index += 1
     return embed
 
+
+def embedding_word_forward(datas,shape):
+    '''
+    @summary: look up the word vectors for the given tokens
+    @param:
+        datas: list of tokens
+        shape: shape of the result
+    @return: array, the word embeddings in the given shape
+    '''
+    model_w2v = getModel_word()
+    embed = np.zeros(shape)
+    length = shape[1]
+    out_index = 0
+    #print(datas)
+    for data in datas:
+        index = 0
+        for item in str(data)[:shape[1]]:
+            if index>=length:
+                break
+            if item in model_w2v.vocab:
+                embed[out_index][index] = model_w2v[item]
+                index += 1
+            else:
+                # embed[out_index][index] = model_w2v['unk']
+                index += 1
+        out_index += 1
+    return embed
+
+
 def formEncoding(text,shape=(100,60),expand=False):
     embedding = np.zeros(shape)
     word_model = getModel_word()
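
A hedged sketch of how the new embedding_word_forward helper would be called; the 10x60 per-cell character grid is an assumption based on shapes used elsewhere in this commit, not a value stated in Utils.py itself:

    import numpy as np
    from BiddingKG.dl.common.Utils import embedding_word_forward

    # two cell texts; each is cut to at most 10 characters, and every character that
    # exists in the word2vec vocabulary becomes a 60-dim vector (unknowns stay zero)
    cells = ["项目名称", "招标人"]
    embed = embedding_word_forward(cells, shape=(len(cells), 10, 60))
    print(embed.shape)  # (2, 10, 60)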

+ 25 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -8,6 +8,7 @@ import time
 import codecs
 
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
+from BiddingKG.dl.table_head.predict import predict
 
 sys.setrecursionlimit(1000000)
 sys.path.append(os.path.abspath("../.."))
@@ -414,6 +415,28 @@ def tableToText(soup):
         
         return inner_table,head_list
 
+    def set_head_model(inner_table):
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                inner_table[i][j] = inner_table[i][j][0]
+
+        # predict table headers with the model
+        predict_list = predict(inner_table)
+        with open(r"C:\Users\Administrator\Desktop\table_head_test.txt", "a") as f:
+            for i in range(len(predict_list)):
+                f.write(str(i) + " " + str(inner_table[i]) + "\n")
+                f.write(str(i) + " " + str(predict_list[i]) + "\n")
+            f.write("\n")
+
+        # print("table_list", inner_table)
+        # print("predict_list", predict_list)
+
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                inner_table[i][j] = [inner_table[i][j], int(predict_list[i][j])]
+        head_list = sliceTable(inner_table)
+        return inner_table, head_list
+
     def setHead_incontext(inner_table,pat_head,fix_value="~~",prob_min=0.5):
 
         data_x,data_position = getPredictor("form").getModel("context").encode(inner_table)
@@ -969,7 +992,8 @@ def tableToText(soup):
         if len(inner_table)>0 and len(inner_table[0])>0:
             #inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
             #inner_table,head_list = setHead_inline(inner_table)
-            inner_table,head_list = setHead_initem(inner_table,pat_head)
+            # inner_table, head_list = setHead_initem(inner_table,pat_head)
+            inner_table, head_list = set_head_model(inner_table)
             # inner_table,head_list = setHead_incontext(inner_table,pat_head)
             # print(inner_table)
             # for begin in range(len(head_list[:-1])):
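
For readers skimming the new set_head_model path: it strips each cell down to its text, asks the table_head predict model for a 0/1 header flag per cell, and re-wraps the cells before sliceTable runs. A minimal sketch of that round trip, with made-up values and assuming each cell arrives as a [text, flag] pair, as the indexing above suggests:

    # cells as they reach set_head_model: [text, flag] pairs
    inner_table = [[["序号", 1], ["名称", 1]],
                   [["1", 0], ["办公楼", 0]]]

    # step 1: keep only the text
    texts = [[cell[0] for cell in row] for row in inner_table]
    # step 2: predict(texts) is expected to return a same-shaped 0/1 matrix, e.g.
    predict_list = [[1, 1], [0, 0]]
    # step 3: re-wrap each cell as [text, head_flag] before calling sliceTable
    rebuilt = [[[texts[i][j], int(predict_list[i][j])] for j in range(len(texts[i]))]
               for i in range(len(texts))]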

BIN
BiddingKG/dl/table_head/checkpoints/best.hdf5 → BiddingKG/dl/table_head/best.hdf5


+ 62 - 17
BiddingKG/dl/table_head/check_user_label_accuracy.py

@@ -8,15 +8,16 @@ def user_label_accuracy(update_user):
         sql = """
         select table_text, pre_label, post_label, id
         from label_table_head_info 
-        where update_user='""" + update_user + "' order by id desc limit 3000"
+        where update_user='""" + update_user + "' order by update_time"
     else:
         sql = """
         select table_text, pre_label, post_label, id
         from label_table_head_info 
-        where update_user='""" + update_user + "' and status = 1 and update_time >= '2022-01-17'"
+        where update_user='""" + update_user + "' and status = 1 and update_time >= '2022-01-23'"
 
     result_list = postgresql_util(sql, limit=1000000)
     right_cnt = 0
+    error_cnt = 0
     error_id_list = []
     right_id_list = []
     i = 0
@@ -24,7 +25,10 @@ def user_label_accuracy(update_user):
     for table in result_list:
         i += 1
         if i % 1000 == 0:
-            print("Loop", i, right_cnt, time.time()-start_time)
+            if right_cnt + error_cnt != 0:
+                print("Loop", i, right_cnt/(right_cnt+error_cnt), time.time()-start_time)
+            else:
+                print("Loop", i, time.time()-start_time)
             start_time = time.time()
 
         pre_label = eval(table[1])
@@ -49,27 +53,44 @@ def user_label_accuracy(update_user):
         else:
             label_list = pre_label
 
-        predict_label_list = predict(table_text)
+        predict_label_list = predict(table_text, model_id=3)
         if predict_label_list:
             if str(label_list) == str(predict_label_list):
-                right_cnt += 1
                 right_id_list.append(str(_id)+"\n")
+                # right_cnt += 1
             else:
-                # cnt = 0
-                # for j in range(len(label_list)):
-                #     row1 = label_list[j]
-                #     row2 = predict_label_list[j]
-                #     if str(row1) != str(row2):
-                #         cnt += 1
-                #     if cnt >= 2:
-                #         error_id_list.append(str(_id)+"\n")
-                #         break
                 error_id_list.append(str(_id)+"\n")
+                # error_cnt += 1
+            if len(label_list) == len(predict_label_list):
+                for j in range(len(label_list)):
+                    for k in range(len(label_list[j])):
+                        if table_text[j][k] == "":
+                            continue
+                        if label_list[j][k] == "1" or predict_label_list[j][k] == "1":
+                            if len(table_text[j][k]) >= 20:
+                                continue
+                            else:
+                                if label_list[j][k] == predict_label_list[j][k]:
+                                    right_cnt += 1
+                                else:
+                                    error_cnt += 1
+            else:
+                print("len(label_list) != len(predict_label_list)", _id,
+                      len(label_list), len(predict_label_list))
 
-    accuracy = right_cnt / len(result_list)
+    accuracy = right_cnt / (right_cnt + error_cnt)
     print(update_user + " accuracy:", accuracy, 'total:', len(result_list))
     print("error_id_list", len(error_id_list))
 
+    save_path = "check_user_result/accuracy.txt"
+    with open(save_path, 'a') as f:
+        f.write(update_user + " "
+                + "表头正确率-" + str(round(accuracy, 2)) + " "
+                + "header accuracy-" + str(round(accuracy, 2)) + " "
+                + "documents-" + str(len(result_list)) + " "
+                + "cells compared-" + str(right_cnt + error_cnt) + " "
+                + "correct headers-" + str(right_cnt)
+
     save_path = "check_user_result/"+update_user+"_error.txt"
     with open(save_path, 'w') as f:
         f.writelines(error_id_list)
@@ -101,9 +122,33 @@ def get_single_result(_id):
 
 
 if __name__ == '__main__':
-    # users = ["test9", "test11", "test12", "test20", "test25", "test26", "test27"]
-    users = ["test20", "test27"]
+    # users = ["test9", "test11", "test12", "test25", "test26"]
+    # users = ["test9", "test11", ]
+    # users = ['test12', 'test25']
+    # users = ["test20", "test27"]
     # users = ['test']
+    users = [
+        "test1",
+        "test11",
+        "test12",
+        "test16",
+        "test17",
+        "test19",
+        "test20",
+        "test21",
+        "test22",
+        "test25",
+        "test26",
+        "test27",
+        "test29",
+        "test3",
+        "test7",
+        "test8",
+        "test9",
+    ]
+    users = ["test"]
+    users = ["test12", "test17", "test21", "test22", "test27", ]
+    users = ["test27"]
     acc_list = []
     for user in users:
         acc = user_label_accuracy(user)
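
Put plainly, the reworked scoring in user_label_accuracy only counts cells that at least one side marks as a header and whose text is shorter than 20 characters; empty cells are skipped. A tiny worked example of that counting rule with made-up labels:

    table_text    = [["序号", "名称"], ["1", "办公楼"]]
    label_list    = [["1", "1"], ["0", "0"]]
    predict_label = [["1", "0"], ["0", "0"]]

    right_cnt = error_cnt = 0
    for j in range(len(label_list)):
        for k in range(len(label_list[j])):
            if table_text[j][k] == "" or len(table_text[j][k]) >= 20:
                continue
            if label_list[j][k] == "1" or predict_label[j][k] == "1":
                if label_list[j][k] == predict_label[j][k]:
                    right_cnt += 1
                else:
                    error_cnt += 1

    print(right_cnt, error_cnt)  # 1 1 -> accuracy 0.5 for this table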

BIN
BiddingKG/dl/table_head/checkpoints/binary_loss/best.hdf5


BIN
BiddingKG/dl/table_head/checkpoints/focal_loss/best.hdf5


+ 7 - 1
BiddingKG/dl/table_head/loss.py

@@ -15,4 +15,10 @@ def focal_loss(gamma=2., alpha=.5):
                                * K.backend.log(K.backend.epsilon()+pt_1))\
                - K.backend.sum((1-alpha) * K.backend.pow(pt_0, gamma)
                                * K.backend.log(1. - pt_0 + K.backend.epsilon()))
-    return f_loss
+    return f_loss
+
+
+def union_loss(gamma=2., alpha=.5):
+    def _loss(y_true, y_pred):
+        # apply the focal loss closure to this batch instead of returning the closure itself
+        return focal_loss(gamma, alpha)(y_true, y_pred)
+    return _loss
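
The focal loss factory (and the union_loss wrapper above) is used like any other Keras loss: call the factory and hand the returned closure to compile(). A minimal sketch with a throwaway one-layer model, not the project's real training setup:

    from keras import layers, models
    from BiddingKG.dl.table_head.loss import focal_loss

    # toy model just to show how the loss factory is wired into compile()
    model = models.Sequential([layers.Dense(1, activation="sigmoid", input_shape=(4,))])
    model.compile(optimizer="adam", loss=focal_loss(gamma=2., alpha=.5))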

+ 272 - 0
BiddingKG/dl/table_head/models/layer_utils.py

@@ -0,0 +1,272 @@
+import os
+import sys
+import tensorflow as tf
+from keras.callbacks import Callback
+from keras.layers import Layer, warnings
+import numpy as np
+sys.path.append(os.path.dirname(__file__))
+from pre_process import get_best_padding_size
+
+
+class BatchReshape1(Layer):
+    """
+    Merge the table's row and column dimensions into the batch dimension
+    (batch, rows, cols, character_num, character_embed) -> (batch*rows*cols, character_num, character_embed)
+    """
+
+    def __init__(self, character_num, character_embed):
+        super(BatchReshape1, self).__init__()
+        self.character_num = character_num
+        self.character_embed = character_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch*height*width,
+                                      self.character_num, self.character_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, self.character_num, self.character_embed
+
+
+class BatchReshape2(Layer):
+    """
+    将Batch维度中的行列拆分出来
+    (batch*rows*cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape2, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class BatchReshape3(Layer):
+    """
+    Merge the table's row dimension into the batch dimension
+    (batch, rows, cols, cell_embed) -> (batch*rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape3, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch*height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, self.cell_embed
+
+
+class BatchReshape4(Layer):
+    """
+    Split the row dimension back out of the batch dimension
+    (batch*rows, cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape4, self).__init__()
+        self.supports_masking = True
+        self.cell_embed = cell_embed
+
+    def compute_mask(self, inputs, mask=None):
+        print(mask)
+        # if mask[0] is None:
+        #     return mask
+
+        # input1 = inputs[0]
+        # input2 = inputs[1]
+        # batch = tf.shape(input1)[0]
+        # height = tf.shape(input1)[1]
+        # width = tf.shape(input1)[2]
+        #
+        # mask_tensor = tf.reshape(mask[1], (batch, height, width, self.cell_embed))
+        return mask
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class BatchReshape5(Layer):
+    """
+    Flatten the table's rows and columns into a single sequence dimension
+    (batch, rows, cols, cell_embed) -> (batch, rows*cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape5, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch, height*width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, self.cell_embed
+
+
+class BatchReshape6(Layer):
+    """
+    Split the flattened sequence dimension back into rows and columns
+    (batch, rows*cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape6, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class MyPadding(Layer):
+    def __init__(self, pad_height, pad_width, cell_embed):
+        super(MyPadding, self).__init__()
+        self.pad_height = pad_height
+        self.pad_width = pad_width
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.pad(inputs, [[0, 0],
+                                  [0, self.pad_height - height],
+                                  [0, self.pad_width - width],
+                                  [0, 0]])
+
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class MySplit(Layer):
+    def __init__(self, height, width, **kwargs):
+        super(MySplit, self).__init__(**kwargs)
+        self.height = height
+        self.width = width
+
+    def call(self, inputs, mask=None, **kwargs):
+        outputs = inputs[:, 0:self.height, 0:self.width]
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None
+
+
+class MyModelCheckpoint(Callback):
+    def __init__(self, filepath, monitor='val_loss', verbose=0,
+                 save_best_only=False, save_weights_only=False,
+                 mode='auto', period=1):
+        super(MyModelCheckpoint, self).__init__()
+        self.monitor = monitor
+        self.verbose = verbose
+        self.filepath = filepath
+        self.save_best_only = save_best_only
+        self.save_weights_only = save_weights_only
+        self.period = period
+        self.epochs_since_last_save = 0
+
+        if mode not in ['auto', 'min', 'max']:
+            warnings.warn('ModelCheckpoint mode %s is unknown, '
+                          'fallback to auto mode.' % (mode),
+                          RuntimeWarning)
+            mode = 'auto'
+
+        if mode == 'min':
+            self.monitor_op = np.less
+            self.best = np.Inf
+        elif mode == 'max':
+            self.monitor_op = np.greater
+            self.best = -np.Inf
+        else:
+            if 'acc' in self.monitor or self.monitor.startswith('fmeasure'):
+                self.monitor_op = np.greater
+                self.best = -np.Inf
+            else:
+                self.monitor_op = np.less
+                self.best = np.Inf
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        self.epochs_since_last_save += 1
+        if self.epochs_since_last_save >= self.period:
+            self.epochs_since_last_save = 0
+            filepath = self.filepath.format(epoch=epoch + 1, **logs)
+            if self.save_best_only:
+                # self.monitor is assumed to hold two metric names; average them,
+                # guarding against a missing metric before doing the arithmetic
+                metric_a = logs.get(self.monitor[0])
+                metric_b = logs.get(self.monitor[1])
+                current = None if metric_a is None or metric_b is None else (metric_a + metric_b) / 2
+                if current is None:
+                    warnings.warn('Can save best model only with %s available, '
+                                  'skipping.' % (self.monitor), RuntimeWarning)
+                else:
+                    if self.monitor_op(current, self.best):
+                        if self.verbose > 0:
+                            print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
+                                  ' saving model to %s'
+                                  % (epoch + 1, self.monitor, self.best,
+                                     current, filepath))
+                        self.best = current
+                        if self.save_weights_only:
+                            self.model.save_weights(filepath, overwrite=True)
+                        else:
+                            self.model.save(filepath, overwrite=True)
+                    else:
+                        if self.verbose > 0:
+                            print('\nEpoch %05d: %s did not improve from %0.5f' %
+                                  (epoch + 1, self.monitor, self.best))
+            else:
+                if self.verbose > 0:
+                    print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
+                if self.save_weights_only:
+                    self.model.save_weights(filepath, overwrite=True)
+                else:
+                    self.model.save(filepath, overwrite=True)
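
All of the BatchReshape layers above lean on the same trick: read the dynamic batch/row/column sizes with tf.shape, fold rows and columns into the batch axis so a per-cell encoder can process every cell at once, then unfold afterwards. A bare TF1-style sketch of that fold/unfold (the 10x60 cell shape is illustrative):

    import numpy as np
    import tensorflow as tf

    x = tf.placeholder(tf.float32, shape=(None, None, None, 10, 60))  # (batch, rows, cols, chars, embed)
    batch, rows, cols = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2]
    folded = tf.reshape(x, (batch * rows * cols, 10, 60))        # BatchReshape1-style fold
    unfolded = tf.reshape(folded, (batch, rows, cols, 10 * 60))  # BatchReshape2-style unfold

    with tf.Session() as sess:
        out = sess.run(unfolded, feed_dict={x: np.zeros((2, 3, 4, 10, 60), np.float32)})
        print(out.shape)  # (2, 3, 4, 600)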

+ 232 - 0
BiddingKG/dl/table_head/models/loop_lstm.py

@@ -0,0 +1,232 @@
+import keras
+import tensorflow as tf
+from keras import models, backend as K
+from keras.layers import Layer, Input, Lambda, Concatenate, Dense, LSTM, Bidirectional
+from tensorflow.contrib.rnn import LSTMCell
+import numpy as np
+
+from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+from BiddingKG.dl.table_head.models.u_net import u_net_small
+
+
+def attention(inputs, w_omega, b_omega, u_omega, time_major=False):
+    if isinstance(inputs, tuple):
+        inputs = tf.concat(inputs, 2)
+    if time_major:  # (B,T,D) => (T,B,D)
+        inputs = tf.transpose(inputs, [1, 0, 2])
+    v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)
+
+    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
+    alphas = tf.nn.softmax(vu, name='alphas')  # (B,T) shape
+    # the result has (B,D) shape
+    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
+
+    return output, alphas
+
+
+class LoopCell(Layer):
+    def __init__(self, hidden_size, attention_size, character_num, character_embed,
+                 cell_embed):
+        super(LoopCell, self).__init__()
+
+        # Hyper parameters
+        self.hidden_size = hidden_size
+        self.attention_size = attention_size
+        self.character_num = character_num
+        self.character_embed = character_embed
+        self.cell_embed = cell_embed
+
+    def build(self, batch_input_shape):
+        super(LoopCell, self).build(batch_input_shape)
+
+        # Trainable parameters
+        # Attention
+        # self.w_omega = self.add_weight("w_omega", shape=[self.hidden_size*2, self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+        # self.b_omega = self.add_weight("b_omega", shape=[self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+        # self.u_omega = self.add_weight("u_omega", shape=[self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+
+        # Bi-LSTM
+        # self.forward_cell = LSTMCell(self.hidden_size, forget_bias=1.0, state_is_tuple=True)
+        # self.backward_cell = LSTMCell(self.hidden_size, forget_bias=1.0, state_is_tuple=True)
+        # self.bi_lism = Bidirectional(LSTM(self.hidden_size, return_sequences=True))
+        # self.bi_lism.build(input_shape=(None, self.character_num, self.character_embed))
+        # self.trainable_weights += self.bi_lism.trainable_weights
+        #
+        # self.self_attention = SeqSelfAttention(attention_activation='sigmoid')
+        # self.self_attention.build(input_shape=(None, self.character_num, 2*self.hidden_size))
+        # self.trainable_weights += self.self_attention.trainable_weights
+        # print(self.trainable_weights)
+
+        # DNN
+        # self.w1 = self.add_weight('W1', [2*self.attention_size, self.cell_embed],
+        #                           initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                           trainable=True)
+        #
+        # self.b1 = self.add_weight('b1', [self.cell_embed],
+        #                           initializer=tf.zeros_initializer(),
+        #                           trainable=True)
+        # self.dense = Dense(self.cell_embed, activation="relu")
+        # print(batch_input_shape[0], batch_input_shape[1], batch_input_shape[2])
+        # self.dense.build(input_shape=(batch_input_shape[0]*batch_input_shape[1]*batch_input_shape[2],
+        #                               2*self.attention_size))
+        # self.trainable_weights += self.dense.trainable_weights
+
+    def call(self, inputs, mask=None, **kwargs):
+        def fn(x):
+            print("fn_0", x)
+
+            # (batch*height*width, character_num, hidden_size)
+            # outputs, last_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.forward_cell,
+            #                                                        cell_bw=self.backward_cell,
+            #                                                        inputs=x,
+            #                                                        dtype=tf.float32,
+            #                                                        time_major=False)
+            # (batch*height*width, character_num, 2*hidden_size)
+            # outputs = self.bi_lism(x)
+            # print("fn_1", outputs)
+
+            # (batch*height*width, character_num, 2*hidden_size)
+            # outputs = self.self_attention(outputs)
+            # print("fn_2", outputs)
+
+            # (batch*height*width, 2*hidden_size)
+            # outputs, _ = attention(outputs, self.w_omega, self.b_omega,
+            #                        self.u_omega, time_major=False)
+
+
+            # (batch*height*width, cell_embedding)
+            # outputs = tf.nn.xw_plus_b(outputs, self.w1, self.b1)
+            # outputs = self.dense(outputs)
+            # print("fn_3", outputs)
+            return x  # every transform above is commented out, so pass the input through
+
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        # (batch, height*width, character_num(time_step), character_embedding)
+        # inputs = tf.reshape(inputs, (tf.shape(inputs)[0],
+        #                              height*width,
+        #                              inputs.shape[3], inputs.shape[4]))
+
+        # (batch*height*width, character_num, character_embedding)
+        outputs = tf.reshape(inputs, (batch*height*width,
+                                      inputs.shape[3], inputs.shape[4]))
+
+        # (height*width, batch, character_num(time_step), character_embedding)
+        # inputs = tf.transpose(inputs, (1, 0, 2, 3))
+
+        # split height*width, each cell
+        # (height*width, batch, cell_embedding)
+        # outputs = tf.map_fn(fn=lambda x: fn(x), elems=inputs, dtype=tf.float32)
+        # print("loop_lstm_1", outputs)
+        # outputs = tf.squeeze(outputs, 0)
+
+        # (batch*height*width, 2*attention_size)
+        # outputs = fn(inputs)
+        # print("loop_lstm_2", outputs)
+
+        # (1, batch*height*width, 2*attention_size)
+        # outputs = tf.expand_dims(outputs, 0)
+        # print("loop_lstm_3", outputs)
+
+        # (batch*height*width, cell_embedding)
+        # outputs = Dense(self.cell_embed, activation="relu")(outputs)
+        # print("loop_lstm_3", outputs)
+
+        # (batch, height*width, cell_embedding)
+        # outputs = tf.transpose(outputs, (1, 0, 2))
+        # print("loop_lstm_2", outputs)
+
+        # (batch, height, width, cell_embedding)
+        # outputs = tf.reshape(outputs, (batch, height, width, self.cell_embed))
+        # print("loop_lstm_4", outputs)
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, self.character_num, self.character_embed
+
+
+class BatchReshape(Layer):
+    def __init__(self, cell_embed):
+        super(BatchReshape, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        # (batch, height, width, cell_embedding)
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        print("batch_reshape", outputs)
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+# def batch_reshape(x):
+#     return K.reshape(x, (batch, height, width, cell_embed))
+
+
+if __name__ == '__main__':
+    input_shape = (16, 8, 10, 60)
+    hidden_size = 64
+    attention_size = 64
+    character_num = 10
+    character_embed = 60
+    cell_embed = 8
+
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    X_train = np.random.uniform(0, 1, (10, 16, 8, 10, 60))
+    X_test = np.random.uniform(0, 1, (10, 16, 8, 10, 60))
+    y_train = np.random.uniform(0, 1, (10, 16, 8))
+    y_test = np.random.uniform(0, 1, (10, 16, 8))
+
+    _input = Input(shape=input_shape, dtype="float32")
+    batch = K.shape(_input)[0]
+    height = K.shape(_input)[1]
+    width = K.shape(_input)[2]
+    print(batch, height, width)
+
+    loop_bi_lstm = LoopCell(hidden_size, attention_size,
+                            character_num, character_embed,
+                            cell_embed)(_input)
+    print("model_2_1", loop_bi_lstm)
+    dense = Dense(cell_embed, activation="relu")(loop_bi_lstm)
+    print("model_2_2", dense)
+    reshape = Lambda(lambda x: K.reshape(x, (batch, height, width, cell_embed)),
+                     output_shape=(height, width, cell_embed))(dense)
+    print("model_2_3", reshape)
+    u_net = u_net_small(loop_bi_lstm)
+    merge = Concatenate(axis=-1)([loop_bi_lstm, u_net])
+    dense = Dense(cell_embed, activation='relu')(merge)
+    dense = Dense(1, activation='sigmoid')(dense)
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+    model = models.Model(inputs=_input, outputs=squeeze)
+    model.summary(line_length=120)
+    model.compile(loss='binary_crossentropy', optimizer='adam')
+    model.fit(X_train, y_train,
+              epochs=2,
+              batch_size=1,
+              validation_data=(X_test, y_test))
+
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    X_train = np.random.uniform(0, 1, (5, 32, 24, 10, 60))
+    X_test = np.random.uniform(0, 1, (5, 32, 24, 10, 60))
+    y_train = np.random.uniform(0, 1, (5, 32, 24))
+    y_test = np.random.uniform(0, 1, (5, 32, 24))
+
+    model.fit(X_train, y_train,
+              epochs=2,
+              batch_size=1,
+              validation_data=(X_test, y_test))

+ 237 - 7
BiddingKG/dl/table_head/models/model.py

@@ -1,16 +1,21 @@
 import sys
 import os
+import numpy as np
+from keras.layers import Lambda, Dense, Reshape, Bidirectional, LSTM, Conv2D, BatchNormalization, LeakyReLU, Masking
+from keras_preprocessing.sequence import pad_sequences
+sys.path.append(os.path.dirname(__file__))
 
-from keras.layers import Lambda
-
-sys.path.append(os.path.abspath("../.."))
-from keras import layers, models
+from models.layer_utils import BatchReshape1, BatchReshape2, MyPadding, MySplit, BatchReshape3, \
+    BatchReshape4, BatchReshape5, BatchReshape6
+from keras import layers, models, Sequential
 import keras.backend as K
-from BiddingKG.dl.table_head.models.my_average_pooling import MyAveragePooling1D
-from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+import tensorflow as tf
+from models.my_average_pooling import MyAveragePooling1D
+from models.self_attention import SeqSelfAttention, MySelfAttention
+from models.u_net import u_net_small
 
 
-def get_model(input_shape, output_shape):
+def model_1(input_shape, output_shape):
     # Input (batch, 10, 60)
     input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
     input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
@@ -67,3 +72,228 @@ def get_model(input_shape, output_shape):
 
     model.summary()
     return model
+
+
+def model_2(input_shape, output_shape):
+    # input_shape = (None, None, 10, 60)
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    hidden_size = 64
+    attention_size = 64
+    character_num = 10
+    character_embed = 60
+    cell_embed = 1
+
+    # Input
+    input_1 = layers.Input(shape=input_shape, dtype="float32", name="input_1")
+    input_2 = layers.Input(shape=(None, None, None, None), dtype="int32", name="input_2")
+    # batch = tf.shape(_input)[0]
+    height = tf.shape(input_2)[1]
+    width = tf.shape(input_2)[2]
+    pad_height = tf.shape(input_2)[3]
+    pad_width = tf.shape(input_2)[4]
+
+    # print("batch, height, width", batch, height, width)
+
+    # Reshape
+    reshape = BatchReshape1(character_num, character_embed)(input_1)
+    print("model_2_0", reshape)
+
+    # Bi-LSTM + Attention
+    bi_lstm = Bidirectional(LSTM(hidden_size))(reshape)
+    print("model_2_1", bi_lstm)
+    # bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape)
+    # self_attention = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm)
+    # trans = Lambda(lambda x: tf.transpose(x, (0, 2, 1)))(self_attention)
+    # dense = Dense(1, activation='relu')(trans)
+    # squeeze = Lambda(lambda x: tf.squeeze(x, -1))(dense)
+
+    dense = Dense(1, activation="sigmoid")(bi_lstm)
+    print("model_2_2", dense)
+    # reshape = Lambda(batch_reshape, output_shape=(height, width, cell_embed))(dense)
+    reshape = BatchReshape2(cell_embed)([input_1, dense])
+    print("model_2_3", reshape)
+    # squeeze_1 = Lambda(lambda x: K.squeeze(x, axis=-1), name="output_1")(reshape)
+    # print("model_2_4", squeeze)
+
+    # Padding
+    padding = MyPadding(pad_height, pad_width, cell_embed)(reshape)
+    # padding = reshape
+    print("model_2_4", padding)
+
+    # U-Net
+    # u_net = u_net_small(padding)
+    # print("model_2_5", u_net)
+
+    # Conv 5*5
+    conv = Conv2D(1, (5, 5), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (5, 5), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (5, 5), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_1 = LeakyReLU(alpha=0.)(bn)
+
+    # Conv 3*3
+    conv = Conv2D(1, (3, 3), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (3, 3), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (3, 3), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_2 = LeakyReLU(alpha=0.)(bn)
+
+    # Conv 1*1
+    conv = Conv2D(1, (1, 1), padding='same')(padding)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (1, 1), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu = LeakyReLU(alpha=0.)(bn)
+
+    conv = Conv2D(1, (1, 1), padding='same')(relu)
+    bn = BatchNormalization()(conv)
+    relu_3 = LeakyReLU(alpha=0.)(bn)
+
+    # conv = Conv2D(cell_embed, (3, 3), padding='same')(relu)
+    # bn = BatchNormalization()(conv)
+    # relu_2 = LeakyReLU(alpha=0.)(bn)
+
+    # Merge
+    # print("model_2_5", relu_1, relu_2)
+    merge = layers.Concatenate(axis=-1)([relu_1, relu_2, relu_3])
+    # merge = u_net
+    # merge = relu
+    dense = layers.Dense(1, activation='sigmoid')(merge)
+    squeeze_2 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+
+    # Split
+    split = MySplit(height, width, name="output")(squeeze_2)
+
+    model = models.Model(inputs=[input_1, input_2], outputs=split)
+    model.summary(line_length=120)
+    return model
+
+
+def model_3(input_shape, output_shape):
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+
+    hidden_size = 16
+    attention_size = 2*hidden_size
+    character_num = 20
+    character_embed = 60
+    cell_embed = 2*hidden_size
+    pad_len = 100
+    mask_timestamps = pad_len
+
+    # Input
+    input_1 = layers.Input(shape=input_shape, dtype="float32", name="input_1")
+    input_2 = layers.Input(shape=(None, None, None, None), dtype="int32", name="input_2")
+
+    # Reshape
+    reshape = BatchReshape1(character_num, character_embed)(input_1)
+    print("model_2_0", reshape)
+
+    # Bi-LSTM
+    bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape)
+    bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=False))(bi_lstm)
+    print("model_2_1", bi_lstm)
+
+    # Reshape
+    reshape = BatchReshape2(cell_embed)([input_1, bi_lstm])
+    print("model_2_3", reshape)
+
+    # Rows Reshape
+    reshape_1 = BatchReshape3(cell_embed)(reshape)
+
+    # Cols Reshape
+    trans = Lambda(lambda x: tf.transpose(x, (0, 2, 1, 3)))(reshape)
+    reshape_2 = BatchReshape3(cell_embed)(trans)
+
+    # All boxes Reshape
+    reshape_3 = BatchReshape5(cell_embed)(reshape)
+
+    # Masking
+    # mask_1 = Masking(mask_value=-1, input_shape=(mask_timestamps, cell_embed))(pad_1)
+    # mask_2 = Masking(mask_value=-1, input_shape=(mask_timestamps, cell_embed))(pad_2)
+    # print("model_2_4", mask_1)
+
+    # Padding
+    # pad_1 = MyPadding()
+
+    # Bi-LSTM
+    # bi_lstm = Bidirectional(LSTM(hidden_size, return_sequences=True))
+    # bi_lstm_1 = bi_lstm(reshape_1)
+    # bi_lstm_2 = bi_lstm(reshape_2)
+    bi_lstm_1 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_1)
+    bi_lstm_2 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_2)
+    # bi_lstm_1 = LSTM(2*hidden_size, return_sequences=True)(reshape_1)
+    # print("model_2_4", bi_lstm_1)
+    # bi_lstm_2 = LSTM(2*hidden_size, return_sequences=True)(reshape_2)
+    # self_attention_1 = MySelfAttention(output_dim=attention_size)(bi_lstm_1)
+    # self_attention_2 = MySelfAttention(output_dim=attention_size)(bi_lstm_2)
+
+    # Bi-LSTM + Attention
+    bi_lstm_3 = Bidirectional(LSTM(hidden_size, return_sequences=True))(reshape_3)
+    # bi_lstm_3 = LSTM(2*hidden_size, return_sequences=True)(reshape_3)
+    # self_attention_3 = MySelfAttention(output_dim=attention_size)(bi_lstm_3)
+    # print("model_2_5", bi_lstm_1)
+
+    # Reshape
+    reshape_1 = BatchReshape4(cell_embed)([reshape, bi_lstm_1])
+    reshape_2 = BatchReshape4(cell_embed)([trans, bi_lstm_2])
+    reshape_2 = Lambda(lambda x: tf.transpose(x, (0, 2, 1, 3)))(reshape_2)
+    reshape_3 = BatchReshape6(cell_embed)([reshape, bi_lstm_3])
+    print("model_2_6", reshape_1)
+
+    # Merge
+    merge = layers.Concatenate(axis=-1)([reshape, reshape_1, reshape_2, reshape_3])
+    dense = layers.Dense(hidden_size, activation='relu')(merge)
+    dense = layers.Dense(1, activation='sigmoid')(dense)
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1), name="output")(dense)
+
+    model = models.Model(inputs=[input_1, input_2], outputs=squeeze)
+    model.summary(line_length=110)
+    return model
+
+
+def get_model(input_shape, output_shape, model_id):
+    if model_id == 1:
+        return model_1(input_shape, output_shape)
+    elif model_id == 2:
+        return model_2(input_shape, output_shape)
+    elif model_id == 3:
+        return model_3(input_shape, output_shape)
+    else:
+        print("No such model!")
+        raise Exception()
+
+
+def test_layer():
+    model = Sequential()
+    model.add(Masking(mask_value=-1, input_shape=(5, 8)))
+    model.add(Lambda(lambda x: pad_sequences(x, maxlen=100, dtype='float32',
+                                             padding='post', truncating='post',
+                                             value=-1)))
+    model.add(Masking(mask_value=-1, input_shape=(5, 8)))
+    model.add(LSTM(32, return_sequences=True))
+
+    model.compile(optimizer='sgd', loss='mse')
+
+    x = np.zeros([1, 5, 8])
+    print(x.shape)
+    y = np.zeros([1, 5, 32])
+    model.summary()
+    model.fit(x, y, batch_size=32, epochs=10)
+
+
+if __name__ == "__main__":
+    test_layer()
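
A hedged sketch of how the new get_model dispatcher might be driven; the shapes below are illustrative guesses rather than values taken from the training script, and model_id=3 selects the row/column Bi-LSTM variant defined above:

    from BiddingKG.dl.table_head.models.model import get_model

    model = get_model(input_shape=(None, None, 20, 60), output_shape=(1,), model_id=3)
    model.compile(optimizer="adam", loss="binary_crossentropy")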

+ 58 - 0
BiddingKG/dl/table_head/models/model_2.py

@@ -0,0 +1,58 @@
+import sys
+import os
+sys.path.append(os.path.abspath("../.."))
+from keras import layers, models
+import tensorflow as tf
+from BiddingKG.dl.table_head.models.my_average_pooling import MyAveragePooling1D
+from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+
+
+def get_model(input_shape, output_shape):
+    # Input
+    input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_3 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_4 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_5 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_6 = layers.Input(shape=input_shape[1:], dtype="float32")
+
+    # Bi-LSTM
+    bi_lstm_1 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_1)
+    bi_lstm_2 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_2)
+    bi_lstm_3 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_3)
+    bi_lstm_4 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_4)
+    bi_lstm_5 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_5)
+    bi_lstm_6 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_6)
+
+    # Self-Attention
+    self_attention_1 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_1)
+    self_attention_2 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_2)
+    self_attention_3 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_3)
+    self_attention_4 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_4)
+    self_attention_5 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_5)
+    self_attention_6 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_6)
+
+    # Concat
+    concat_1 = layers.concatenate([self_attention_1, self_attention_2, self_attention_3])
+    concat_2 = layers.concatenate([self_attention_4, self_attention_5, self_attention_6])
+
+    # Dense + Sigmoid
+    dense_1 = layers.Dense(output_shape[0], activation="sigmoid")(concat_1)
+    dense_2 = layers.Dense(output_shape[0], activation="sigmoid")(concat_2)
+
+    # mask mean pooling
+    pool_1 = MyAveragePooling1D(axis=1)(dense_1)
+    pool_2 = MyAveragePooling1D(axis=1)(dense_2)
+
+    # Concat
+    concat = layers.concatenate([pool_1, pool_2])
+
+    # Dense
+    output = layers.Dense(10)(concat)
+    output = layers.Dense(1, activation="sigmoid", name='output')(output)
+
+    model = models.Model(inputs=[input_1, input_2, input_3, input_4, input_5, input_6],
+                         outputs=output)
+
+    model.summary()
+    return model

+ 40 - 1
BiddingKG/dl/table_head/models/self_attention.py

@@ -1,5 +1,6 @@
 import keras
 from keras import backend as K
+from keras.layers import Layer
 
 
 class SeqSelfAttention(keras.layers.Layer):
@@ -237,4 +238,42 @@ class SeqSelfAttention(keras.layers.Layer):
 
     @staticmethod
     def get_custom_objects():
-        return {'SeqSelfAttention': SeqSelfAttention}
+        return {'SeqSelfAttention': SeqSelfAttention}
+
+
+class MySelfAttention(Layer):
+    def __init__(self, output_dim, **kwargs):
+        self.output_dim = output_dim
+        super(MySelfAttention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # inputs.shape = (batch_size, time_steps, seq_len)
+        self.W_Q = self.add_weight(name='W_Q',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+        self.W_K = self.add_weight(name='W_K',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+        self.W_V = self.add_weight(name='W_V',
+                                   shape=(input_shape[2], self.output_dim),
+                                   initializer='uniform',
+                                   trainable=True)
+
+        super(MySelfAttention, self).build(input_shape)
+
+    def call(self, x, mask=None, **kwargs):
+        _Q = K.dot(x, self.W_Q)
+        _K = K.dot(x, self.W_K)
+        _V = K.dot(x, self.W_V)
+
+        # use batch_dot with permute_dimensions in place of a plain transpose
+        _Z = K.batch_dot(_Q, K.permute_dimensions(_K, [0, 2, 1]))
+        _Z = _Z / (self.output_dim**0.5)
+        _Z = K.softmax(_Z)
+        _Z = K.batch_dot(_Z, _V)
+        return _Z
+
+    def compute_output_shape(self, input_shape):
+        return input_shape[0], input_shape[1], self.output_dim
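
MySelfAttention is plain scaled dot-product self-attention over the time axis (Q, K, V projections of the same sequence, softmax(QK^T / sqrt(d))V). A minimal usage sketch with made-up dimensions:

    import numpy as np
    from keras import layers, models
    from BiddingKG.dl.table_head.models.self_attention import MySelfAttention

    inp = layers.Input(shape=(10, 60))          # (batch, time_steps, features)
    att = MySelfAttention(output_dim=32)(inp)   # (batch, time_steps, 32)
    model = models.Model(inp, att)
    print(model.predict(np.zeros((2, 10, 60))).shape)  # (2, 10, 32)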

+ 49 - 0
BiddingKG/dl/table_head/models/tf_bi_lstm.py

@@ -0,0 +1,49 @@
+import tensorflow as tf
+from tensorflow.contrib.rnn import LSTMCell
+from tensorflow.contrib.rnn import MultiRNNCell
+
+
+class LstmBase:
+    """
+    build rnn cell
+    """
+    def build_rnn(self, hidden_size, num_layes):
+        cells = []
+        for i in range(num_layes):
+            cell = LSTMCell(num_units=hidden_size,
+                            state_is_tuple=True,
+                            initializer=tf.random_uniform_initializer(-0.25, 0.25))
+            cells.append(cell)
+        cells = MultiRNNCell(cells, state_is_tuple=True)
+
+        return cells
+
+
+class BiLstm(LstmBase):
+    """
+    define the lstm
+    """
+    def __init__(self, scope_name, hidden_size, num_layers):
+        super(BiLstm, self).__init__()
+        assert hidden_size % 2 == 0
+        hidden_size //= 2  # integer division so LSTMCell gets an int unit count
+
+        self.fw_rnns = []
+        self.bw_rnns = []
+        for i in range(num_layers):
+            self.fw_rnns.append(self.build_rnn(hidden_size, 1))
+            self.bw_rnns.append(self.build_rnn(hidden_size, 1))
+
+        self.scope_name = scope_name
+
+    def __call__(self, input, input_len):
+        for idx, (fw_rnn, bw_rnn) in enumerate(zip(self.fw_rnns, self.bw_rnns)):
+            scope_name = '{}_{}'.format(self.scope_name, idx)
+            ctx, _ = tf.nn.bidirectional_dynamic_rnn(
+                fw_rnn, bw_rnn, input, sequence_length=input_len,
+                dtype=tf.float32, time_major=False,
+                scope=scope_name
+            )
+            input = tf.concat(ctx, -1)
+        ctx = input
+        return ctx
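
BiLstm is a thin wrapper around tf.nn.bidirectional_dynamic_rnn that stacks one or more forward/backward LSTMCell pairs and concatenates their outputs. A TF1-style usage sketch with made-up shapes:

    import numpy as np
    import tensorflow as tf
    from BiddingKG.dl.table_head.models.tf_bi_lstm import BiLstm

    x = tf.placeholder(tf.float32, shape=(None, 10, 60))   # (batch, time, features)
    seq_len = tf.placeholder(tf.int32, shape=(None,))
    ctx = BiLstm("cell_encoder", hidden_size=64, num_layers=1)(x, seq_len)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        out = sess.run(ctx, feed_dict={x: np.zeros((2, 10, 60), np.float32),
                                       seq_len: [10, 10]})
        print(out.shape)  # (2, 10, 64): forward and backward halves concatenated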

+ 82 - 0
BiddingKG/dl/table_head/models/u_net.py

@@ -0,0 +1,82 @@
+from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, BatchNormalization, UpSampling2D
+from keras.layers import LeakyReLU
+
+
+def u_net_small(inputs, num_classes=1):
+    # 8
+    use_bias = False
+    down0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(inputs)
+    down0 = BatchNormalization()(down0)
+    down0 = LeakyReLU(alpha=0.)(down0)
+    down0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(down0)
+    down0 = BatchNormalization()(down0)
+    down0 = LeakyReLU(alpha=0.)(down0)
+    down0_pool = MaxPooling2D((2, 2), strides=(2, 2))(down0)
+
+    # 4
+    down1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(down0_pool)
+    down1 = BatchNormalization()(down1)
+    down1 = LeakyReLU(alpha=0.)(down1)
+    down1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(down1)
+    down1 = BatchNormalization()(down1)
+    down1 = LeakyReLU(alpha=0.)(down1)
+    down1_pool = MaxPooling2D((2, 2), strides=(2, 2))(down1)
+
+    # 2
+    down2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(down1_pool)
+    down2 = BatchNormalization()(down2)
+    down2 = LeakyReLU(alpha=0.)(down2)
+    down2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(down2)
+    down2 = BatchNormalization()(down2)
+    down2 = LeakyReLU(alpha=0.)(down2)
+    down2_pool = MaxPooling2D((2, 2), strides=(2, 2))(down2)
+
+    center = Conv2D(64, (3, 3), padding='same', use_bias=use_bias)(down2_pool)
+    center = BatchNormalization()(center)
+    center = LeakyReLU(alpha=0.)(center)
+    center = Conv2D(64, (3, 3), padding='same', use_bias=use_bias)(center)
+    center = BatchNormalization()(center)
+    center = LeakyReLU(alpha=0.)(center)
+
+    # 2
+    up2 = UpSampling2D((2, 2))(center)
+    up2 = concatenate([down2, up2], axis=3)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+
+    # 4
+    up1 = UpSampling2D((2, 2))(up2)
+    up1 = concatenate([down1, up1], axis=3)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+
+    # 8
+    up0 = UpSampling2D((2, 2))(up1)
+    up0 = concatenate([down0, up0], axis=3)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+
+    # classify
+    # classify = Conv2D(num_classes, (1, 1), activation='sigmoid')(up0)
+    return up0
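
u_net_small downsamples three times with 2x2 pooling and upsamples back, so its spatial input dims should be multiples of 8; that appears to be why get_best_padding_size in pre_process.py pads tables with min_len=8. A minimal sketch wrapping it in a Model (the 16x24x1 input is illustrative):

    from keras import layers, models
    from BiddingKG.dl.table_head.models.u_net import u_net_small

    inp = layers.Input(shape=(16, 24, 1))   # height and width are multiples of 8
    out = u_net_small(inp)                  # (batch, 16, 24, 8) feature map
    model = models.Model(inp, out)
    model.summary()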

+ 18 - 0
BiddingKG/dl/table_head/post_process.py

@@ -24,3 +24,21 @@ def table_post_process(table_text_list, predict_result, threshold=0.5):
         print("table_post_process output label dimensions do not match the text!")
         table_label_list = []
     return table_label_list
+
+
+def table_post_process_2(table_text_list, predict_result, threshold=0.5):
+    predict_result = predict_result.tolist()[0]
+    predict_list = []
+    for row in predict_result:
+        new_row = []
+        for col in row:
+            if col >= threshold:
+                new_row.append("1")
+            else:
+                new_row.append("0")
+        predict_list.append(new_row)
+
+    if len(predict_list) != len(table_text_list):
+        print("table_post_process_2 output label dimensions do not match the text!")
+        predict_list = []
+    return predict_list
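
table_post_process_2 simply thresholds the model output into per-cell '0'/'1' strings. A small worked example with a hand-made 1x2x2 prediction array (threshold 0.5, the default in the signature):

    import numpy as np
    from BiddingKG.dl.table_head.post_process import table_post_process_2

    predict_result = np.array([[[0.91, 0.73],
                                [0.12, 0.40]]])     # (1, rows, cols) model output
    table_text = [["序号", "名称"], ["1", "办公楼"]]

    print(table_post_process_2(table_text, predict_result))
    # [['1', '1'], ['0', '0']] -- one '0'/'1' string per cell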

+ 22 - 16
BiddingKG/dl/table_head/postgresql2csv.py

@@ -21,23 +21,29 @@ def eval_text_list(table_text):
 def read_postgresql(txt_name, start_id, _time):
     conn = psycopg2.connect(database="table_head_label", user="postgres",
                             password="postgres", host="192.168.2.103", port="5432")
-
-    with open('check_user_result/' + txt_name, "r") as f:
-        id_list = f.readlines()
-    # with open('check_user_result/test27.txt', "r") as f:
-    #     id_list += f.readlines()
-
-    _list = []
-    for _id in id_list:
-        _id = _id[:-1]
-        sql = 'select * from label_table_head_info where id =' + _id
+    row_list = []
+    if txt_name == "":
+        sql = """
+        select * from "label_table_head_info" 
+        where status = 1 and update_time >= '2022-01-17';
+        """
         df = pd.read_sql(sql=sql, con=conn)
-        # df = df[0]
         for index, row in df.iterrows():
-            _list.append([x for x in row])
-    cnt = 0
+            row_list.append([x for x in row])
+    else:
+        with open('check_user_result/' + txt_name, "r") as f:
+            id_list = f.readlines()
+        for _id in id_list:
+            _id = _id[:-1]
+            sql = 'select * from label_table_head_info where id =' + _id
+            df = pd.read_sql(sql=sql, con=conn)
+            # df = df[0]
+            for index, row in df.iterrows():
+                row_list.append([x for x in row])
+        cnt = 0
+
     new_list = []
-    for line in _list:
+    for line in row_list:
         try:
             table_text = eval_text_list(line[2])
         except:
@@ -57,7 +63,7 @@ def read_postgresql(txt_name, start_id, _time):
         label_list = predict(table_text)
         line[3] = str(label_list)
         new_list.append(line)
-    df = pd.DataFrame(_list)
+    df = pd.DataFrame(new_list)
     new_csv_path = "data_new.csv"
 
     df.to_csv(new_csv_path, index=False)
@@ -66,7 +72,7 @@ def read_postgresql(txt_name, start_id, _time):
 
 
 if __name__ == '__main__':
-    new_csv_path = read_postgresql('test20_error.txt', 203995, '2022-01-01 00:00:00')
+    new_csv_path = read_postgresql('test11_error.txt', 206863, '2021-12-31 00:00:00')
     # new_csv_path = read_postgresql('test20_right.txt', 203995, '')
     # df = pd.read_csv('data_new.csv')
     # print(df.iloc[:, 4])

+ 241 - 29
BiddingKG/dl/table_head/pre_process.py

@@ -1,8 +1,10 @@
+import os
 import random
-
+import sys
 import psycopg2
 import numpy as np
-from BiddingKG.dl.common.Utils import embedding_word
+sys.path.append(os.path.dirname(__file__) + "/../")
+from common.Utils import embedding_word, embedding_word_forward
 
 
 def get_sentence_index_list(sentence, dict_path='utils/ppocr_keys_v1.txt'):
@@ -42,22 +44,28 @@ def postgresql_util(sql, limit):
     return all_rows
 
 
-def get_data_from_sql(dim=10):
+def get_data_from_sql(dim=10, whole_table=False, padding=True):
+    sql = """
+    select table_text, pre_label, post_label, id
+    from label_table_head_info
+    where status = 0 and (update_user='test9' or update_user='test1' or update_user='test7' or update_user='test26')
+    ;
+    """
     # sql = """
     # select table_text, pre_label, post_label, id
     # from label_table_head_info
-    # where update_user <> 'test27' and update_user <> 'test20' and table_box_cnt >= 4 and table_box_cnt <= 200
+    # where status = 1 and update_time >= '2022-01-17' and update_time <= '2022-01-22'
     # ;
     # """
-    sql = """
-    select table_text, pre_label, post_label, id
-    from label_table_head_info 
-    where status = 1 and update_time >= '2022-01-17'
-    ;
-    """
 
     result_list = postgresql_util(sql, limit=1000000)
 
+    # ids to exclude
+    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+        delete_id_list = eval(f.read())
+    with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "r") as f:
+        delete_id_list += eval(f.read())
+
     all_data_list = []
     all_data_label_list = []
     i = 0
@@ -71,6 +79,10 @@ def get_data_from_sql(dim=10):
         post_label = eval(table[2])
         _id = table[3]
 
+        if _id in delete_id_list:
+            print("pass", _id)
+            continue
+
         # table_text requires special handling
         try:
             table_text = table[0]
@@ -84,17 +96,35 @@ def get_data_from_sql(dim=10):
             print("could not parse table_text", _id)
             continue
 
-        # also skip tables with only one row
-        if len(post_label) >= 2:
-            data_list, data_label_list = table_pre_process(table_text, post_label, _id)
-        elif len(pre_label) >= 2:
-            data_list, data_label_list = table_pre_process(table_text, pre_label, _id)
+        if whole_table:
+            if len(post_label) >= 2:
+                data_list, data_label_list = table_pre_process_2(table_text, post_label,
+                                                                 _id, padding=padding)
+            elif len(pre_label) >= 2:
+                data_list, data_label_list = table_pre_process_2(table_text, pre_label,
+                                                                 _id, padding=padding)
+            else:
+                data_list, data_label_list = [], []
         else:
-            data_list, data_label_list = [], []
+            # also skip tables with only one row
+            if len(post_label) >= 2:
+                data_list, data_label_list = table_pre_process(table_text, post_label, _id)
+            elif len(pre_label) >= 2:
+                data_list, data_label_list = table_pre_process(table_text, pre_label, _id)
+            else:
+                data_list, data_label_list = [], []
 
         all_data_list += data_list
         all_data_label_list += data_label_list
 
+    # sort by table dimensions (rows, then cols)
+    if whole_table:
+        _list = []
+        for data, label in zip(all_data_list, all_data_label_list):
+            _list.append([data, label])
+        _list.sort(key=lambda x: (len(x[0]), len(x[0][0])))
+        all_data_list[:], all_data_label_list[:] = zip(*_list)
+
     print("len(all_data_list)", len(all_data_list))
     return all_data_list, all_data_label_list
 
@@ -206,7 +236,84 @@ def table_pre_process(text_list, label_list, _id, is_train=True):
         return data_list
 
 
-def get_data_from_file(file_type):
+def table_pre_process_2(text_list, label_list, _id, is_train=True, padding=True):
+    """
+    表格处理,整个表格为一个数组,且填充长宽维度
+
+    :param text_list:
+    :param label_list:
+    :param _id:
+    :param is_train:
+    :return:
+    """
+    # check whether the table's height/width can be padded to supported sizes
+    row_len = len(text_list)
+    best_row_len = get_best_padding_size(row_len, min_len=8)
+    col_len = len(text_list[0])
+    best_col_len = get_best_padding_size(col_len, min_len=8)
+    if best_row_len is None:
+        if is_train:
+            return [], []
+        else:
+            return []
+    if best_col_len is None:
+        if is_train:
+            return [], []
+        else:
+            return []
+
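+    # NOTE: get_best_padding_size (defined below) never actually returns None;
+    # an oversized axis is returned at its own length, so the two None guards
+    # above are defensive only.
+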
+    if is_train:
+        if len(text_list) != len(label_list):
+            print("文字单元格与标注单元格数量不匹配!", _id)
+            print("len(text_list)", len(text_list), "len(label_list)", len(label_list))
+            return [], []
+
+        if padding:
+            for i in range(row_len):
+                col_len = len(text_list[i])
+                text_list[i] += [None]*(best_col_len-col_len)
+                if is_train:
+                    label_list[i] += ["0"]*(best_col_len-col_len)
+            text_list += [[None]*best_col_len]*(best_row_len-row_len)
+            if is_train:
+                label_list += [["0"]*best_col_len]*(best_row_len-row_len)
+
+    if is_train:
+        for i in range(len(label_list)):
+            for j in range(len(label_list[i])):
+                label_list[i][j] = int(label_list[i][j])
+        return [text_list], [label_list]
+    else:
+        return [text_list]
+
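+# A minimal usage sketch, added for illustration only: _example_table_pre_process_2
+# is a hypothetical helper (not used anywhere in the pipeline) and its 2x3 table
+# is made up, not taken from the labeling database.
+def _example_table_pre_process_2():
+    texts = [["name", "qty", "price"], ["printer", "2", ""]]
+    labels = [["1", "1", "1"], ["0", "0", "0"]]
+    # both lists are padded in place and returned wrapped in an outer list
+    data, label = table_pre_process_2(texts, labels, _id=0)
+    # height and width are padded up to the next multiple of 8
+    assert len(data[0]) == 8 and len(data[0][0]) == 8
+    # real labels are cast to int, padded cells get label 0
+    assert label[0][0][0] == 1 and label[0][-1][-1] == 0
+    return data, label
+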
+
+def get_best_padding_size(axis_len, min_len=3, max_len=300):
+    # sizes = [8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+    #          128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224,
+    #          232, 240, 248, 256, 264, 272, 280, 288, 296]
+    # sizes = [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57,
+    #          60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99, 102, 105, 108, 111,
+    #          114, 117, 120, 123, 126, 129, 132, 135, 138, 141, 144, 147, 150, 153, 156,
+    #          159, 162, 165, 168, 171, 174, 177, 180, 183, 186, 189, 192, 195, 198, 201,
+    #          204, 207, 210, 213, 216, 219, 222, 225, 228, 231, 234, 237, 240, 243, 246,
+    #          249, 252, 255, 258, 261, 264, 267, 270, 273, 276, 279, 282, 285, 288, 291,
+    #          294, 297]
+    sizes = []
+    for i in range(1, max_len):
+        if i * min_len <= max_len:
+            sizes.append(i * min_len)
+    if axis_len > sizes[-1]:
+        return axis_len
+    best_len = sizes[-1]
+    for height in sizes:
+        if axis_len <= height:
+            best_len = height
+            break
+    # print("get_best_padding_size", axis_len, best_len)
+    return best_len
+
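+# For illustration: the result is the next multiple of min_len at or above axis_len,
+# e.g. get_best_padding_size(13, min_len=8) == 16 and get_best_padding_size(9, 5) == 10;
+# an axis longer than the largest generated multiple is returned unchanged, so the
+# function never returns None.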
+
+def get_data_from_file(file_type, model_id=1):
     if file_type == 'np':
         data_path = 'train_data/data_3.npy'
         data_label_path = 'train_data/data_label_3.npy'
@@ -215,17 +322,20 @@ def get_data_from_file(file_type):
         array2 = np.load(data_label_path)
         return array1, array2
     elif file_type == 'txt':
-        data_path = 'train_data/data3.txt'
-        data_label_path = 'train_data/data_label3.txt'
-
+        if model_id == 1:
+            data_path = 'train_data/data1.txt'
+            data_label_path = 'train_data/data_label1.txt'
+        elif model_id == 2:
+            data_path = 'train_data/data2.txt'
+            data_label_path = 'train_data/data_label2.txt'
+        elif model_id == 3:
+            data_path = 'train_data/data3.txt'
+            data_label_path = 'train_data/data_label3.txt'
+        else:
+            print("model_id error! only 1, 2 and 3 supported")
+            return [], []
         with open(data_path, 'r') as f:
             data_list = f.readlines()
         with open(data_label_path, 'r') as f:
             data_label_list = f.readlines()
 
-        # for i in range(len(data_list)):
-        #     data_list[i] = eval(data_list[i][:-1])
-        #     data_label_list[i] = eval(data_label_list[i][:-1])
         return data_list, data_label_list
     else:
         print("file type error! only np and txt supported")
@@ -245,18 +355,19 @@ def processed_save_to_np():
     #         f.write(str(line) + "\n")
 
 
-def processed_save_to_txt():
-    list1, list2 = get_data_from_sql()
+def processed_save_to_txt(whole_table=False, padding=True):
+    list1, list2 = get_data_from_sql(whole_table=whole_table, padding=padding)
 
     # shuffle
+    # if not whole_table or not padding:
     zip_list = list(zip(list1, list2))
     random.shuffle(zip_list)
     list1[:], list2[:] = zip(*zip_list)
 
-    with open('train_data/data3.txt', 'w') as f:
+    with open('train_data/data1.txt', 'w') as f:
         for line in list1:
             f.write(str(line) + "\n")
-    with open('train_data/data_label3.txt', 'w') as f:
+    with open('train_data/data_label1.txt', 'w') as f:
         for line in list2:
             f.write(str(line) + "\n")
 
@@ -287,7 +398,7 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
     data_num = len(data_list)
 
     # embedding output shape
-    output_shape = (6, 10, 60)
+    output_shape = (6, 20, 60)
 
     # fetch data batch by batch
     i = 0
@@ -349,8 +460,109 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
                    'input_4': X[3], 'input_5': X[4], 'input_6': X[5], }
 
 
+def my_data_loader_2(table_list, table_label_list, batch_size, is_train=True):
+    pad_len = 0
+
+    table_num = len(table_list)
+    if is_train and batch_size == 1:
+        table_list, table_label_list = get_random(table_list, table_label_list)
+
+    # Embedding shape
+    output_shape = (20, 60)
+
+    # fetch data batch by batch
+    i = 0
+    last_shape = None
+    while True:
+        new_table_list = []
+        new_table_label_list = []
+        for j in range(batch_size):
+            if i >= table_num:
+                i = 0
+                if is_train:
+                    table_list, table_label_list = get_random(table_list, table_label_list,
+                                                              seed=random.randint(1, 40))
+
+            if type(table_list[i]) != list:
+                table = eval(table_list[i][:-1])
+            else:
+                table = table_list[i]
+
+            if batch_size > 1:
+                if last_shape is None:
+                    last_shape = (len(table), len(table[0]))
+                    continue
+                if (len(table), len(table[0])) != last_shape:
+                    last_shape = (len(table), len(table[0]))
+                    break
+
+            if is_train:
+                table_label = eval(table_label_list[i][:-1])
+
+            # map the characters of each cell to word-vector embeddings
+            for k in range(len(table)):
+                table[k] = embedding_word_forward(table[k], (len(table[k]),
+                                                     output_shape[0],
+                                                     output_shape[1]))
+            new_table_list.append(table)
+            if is_train:
+                new_table_label_list.append(table_label)
+            i += 1
+        new_table_list = np.array(new_table_list)
+        X = new_table_list
+        if X.shape[-2:] != output_shape:
+            # print("Dimension not match!", X.shape)
+            # print("\n")
+            continue
+
+        # compute the padding sizes
+        pad_height = get_best_padding_size(X.shape[1], pad_len)
+        pad_width = get_best_padding_size(X.shape[2], pad_len)
+        input_2 = np.zeros([1, X.shape[1], X.shape[2], pad_height, pad_width])
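+        # pad_len is 0 here, so get_best_padding_size returns the raw axis lengths
+        # and input_2 is just a zero tensor mirroring the table's row/column dims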
+
+        if is_train:
+            new_table_label_list = np.array(new_table_label_list)
+            Y = new_table_label_list
+            # Y = Y.astype(np.float32)
+            # yield {"input_1": X, "input_2": input_2}, \
+            #       {"output_1": Y, "output_2": Y}
+            yield {"input_1": X, "input_2": input_2}, \
+                  {"output": Y}
+        else:
+            yield {"input_1": X, "input_2": input_2}
+
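+# Usage sketch (illustration only): train.py passes this generator straight to
+# Keras fit_generator when model_id != 1, roughly:
+#     loader = my_data_loader_2(table_list, table_label_list, batch_size=1)
+#     inputs, targets = next(loader)
+#     # inputs["input_1"].shape == (1, rows, cols, 20, 60)
+#     # inputs["input_2"].shape == (1, rows, cols, rows, cols)   zero placeholder
+#     # targets["output"].shape == (1, rows, cols)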
+
+def check_train_data():
+    data_list, label_list = get_data_from_file('txt', model_id=2)
+    for data in data_list:
+        data = eval(data)
+        if len(data) % 8 != 0:
+            print(len(data))
+            print(len(data[0]))
+        for row in data:
+            if len(row) % 8 != 0:
+                print(len(data))
+                print(len(row))
+
+
+def get_random(text_list, label_list, seed=42):
+    random.seed(seed)
+    zip_list = list(zip(text_list, label_list))
+    random.shuffle(zip_list)
+    text_list[:], label_list[:] = zip(*zip_list)
+    return text_list, label_list
+
+
 if __name__ == '__main__':
-    processed_save_to_txt()
+    processed_save_to_txt(whole_table=False, padding=False)
     # data_balance()
 
     # test_embedding()
+    # check_train_data()
+
+    # _list = []
+    # for i in range(1, 100):
+    #     _list.append(i*3)
+    # print(_list)
+
+    # print(get_best_padding_size(9, 5))

Diff for this file is too large to display
+ 121 - 82
BiddingKG/dl/table_head/predict.py


+ 17 - 0
BiddingKG/dl/table_head/preprocessing_test.py

@@ -0,0 +1,17 @@
+import codecs
+import pandas as pd
+from bs4 import BeautifulSoup
+from BiddingKG.dl.interface.extract import predict
+
+
+def test():
+    df = pd.read_excel("has_table_no_attach.xlsx")
+    for index, row in df.iterrows():
+        if index % 100 == 0:
+            print("Loop", index)
+        text = row['dochtmlcon']
+        predict(str(index), text)
+
+
+if __name__ == "__main__":
+    test()

+ 188 - 0
BiddingKG/dl/table_head/table_simplify.py

@@ -0,0 +1,188 @@
+#coding:utf-8
+import json
+import logging
+
+from BiddingKG.dl.table_head.pre_process import postgresql_util
+
+
+user_score = {
+    "test": 1.,
+    "test1": 0.83,
+    "test11": 0.82,
+    "test12": 0.74,
+    "test16": 0.83,
+    "test17": 0.77,
+    "test19": 0.79,
+    "test20": 0.82,
+    "test21": 0.73,
+    "test22": 0.64,
+    "test25": 0.77,
+    "test26": 0.80,
+    "test27": 0.72,
+    "test29": 0.8,
+    "test3": 0.,
+    "test7": 0.82,
+    "test8": 0.78,
+    "test9": 0.80,
+}
+
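+# Annotator quality scores: when remove_duplicate finds two near-identical tables
+# it keeps the copy labeled by the higher-scoring user (unknown users count as 0).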
+
+def get_labeled_table():
+    sql = """
+    select id, update_user, table_text, pre_label, post_label
+    from label_table_head_info where status = 0
+    """
+
+    result_list = postgresql_util(sql, limit=1000000)
+    print("len(result_list)", len(result_list))
+    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+        not_eval_table_list = f.read()
+    not_eval_table_list = eval(not_eval_table_list)
+
+    table_list = []
+    # not_eval_table_list = []
+    for table in result_list:
+        pre_label = eval(table[3])
+        post_label = eval(table[4])
+        _id = table[0]
+        update_user = table[1]
+        table_text = table[2]
+        if _id in not_eval_table_list:
+            continue
+
+        try:
+            if table_text[0] == '"':
+                table_text = eval(table_text)
+            table_text = table_text.replace('\\', '/')
+            table_text = eval(table_text)
+        except:
+            print("无法识别table_text", _id)
+            not_eval_table_list.append(_id)
+            continue
+
+        if post_label:
+            label_list = post_label
+        else:
+            label_list = pre_label
+
+        table_list.append([table_text, label_list, update_user, _id])
+    print("len(table_list)", len(table_list))
+    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "w") as f:
+    #     f.write(str(not_eval_table_list))
+    return table_list
+
+
+def table_distance(table1, table2, thresh=0.85):
+    # flatten
+    table1 = [col for row in table1 for col in row]
+    table2 = [col for row in table2 for col in row]
+    while "" in table1:
+        table1.remove("")
+    while "" in table2:
+        table2.remove("")
+
+    equal_cnt = 0
+    not_equal_cnt = 0
+    equal_flag = 0
+    for col1 in table1:
+        find_flag = 0
+        for col2 in table2:
+            if col1 == col2:
+                equal_cnt += 1
+                find_flag = 1
+                break
+        if not find_flag:
+            not_equal_cnt += 1
+        # print(equal_cnt, not_equal_cnt)
+        if round(equal_cnt / max(len(table1), len(table2)), 2) >= thresh:
+            # print("> thresh")
+            equal_flag = 1
+            break
+        if round(not_equal_cnt / max(len(table1), len(table2)), 2) >= 1-thresh:
+            # print("> 1-thresh")
+            equal_flag = 0
+            break
+    return equal_flag
+
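+# For illustration: two flattened tables count as duplicates once at least thresh
+# (85%) of the larger one's non-empty cells appear in the other, e.g.
+#     table_distance([['a', 'b']], [['a', 'b']]) == 1
+#     table_distance([['a', 'b']], [['a', 'c']]) == 0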
+
+def remove_duplicate(table_list):
+    logging.info("into remove_duplicate")
+    table_list.sort(key=lambda x: x[0])
+    delete_table_id_list = []
+    for i in range(len(table_list)):
+        delete_table_id_list = list(set(delete_table_id_list))
+        if i % 1000 == 0:
+            print("Loop", i, "len(delete_table_id_list)", len(delete_table_id_list))
+            logging.info("*")
+            with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "w") as f:
+                f.write(str(delete_table_id_list))
+        table1 = table_list[i]
+        if len(table1[0]) <= 2 and len(table1[0][0]) <= 2:
+            delete_table_id_list.append(table1[3])
+            continue
+        for j in range(i+1, len(table_list)):
+            table2 = table_list[j]
+            if len(table2[0]) <= 2 and len(table2[0][0]) <= 2:
+                delete_table_id_list.append(table2[3])
+                continue
+            # skip if the row counts differ by 2 or more
+            if abs(len(table1[0]) - len(table2[0])) >= 2:
+                continue
+            # skip if the column counts differ by 2 or more
+            if abs(len(table1[0][0]) - len(table2[0][0])) >= 2:
+                continue
+            if table_distance(table1[0], table2[0]):
+                print("equal", table1[3], table2[3])
+                score1 = user_score.get(table1[2])
+                score2 = user_score.get(table2[2])
+                if score1 is None:
+                    score1 = 0.
+                if score2 is None:
+                    score2 = 0.
+                if score1 >= score2:
+                    delete_table_id_list.append(table2[3])
+                else:
+                    delete_table_id_list.append(table1[3])
+
+    delete_table_id_list = list(set(delete_table_id_list))
+    new_table_list = []
+    for table in table_list:
+        if table[3] not in delete_table_id_list:
+            new_table_list.append(table)
+    return new_table_list
+
+
+def eval_table(_str):
+    try:
+        if _str[0] == '"':
+            table_text = eval(_str)
+        else:
+            table_text = _str
+        table_text = table_text.replace('\\', '/')
+        table_text = eval(table_text)
+    except:
+        print("无法识别table_text")
+        table_text = ""
+    return table_text
+
+
+if __name__ == '__main__':
+    _list = get_labeled_table()
+    _list = remove_duplicate(_list)
+    _str = json.dumps(str(_list))
+    with open(r"C:\Users\Administrator\Desktop\table_simplify.txt", "w") as f:
+        f.write(_str)
+
+    # _str1 = "[['', '', 'Yes']]"
+    # _str2 = "[['', '', 'Yes', '']]"
+    # table1 = eval_table(_str1)
+    # table2 = eval_table(_str2)
+    #
+    # print(table_distance(table1, table2))
+
+    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+    #     not_eval_table_list = f.read()
+    # print(not_eval_table_list)
+    # not_eval_table_list = eval(not_eval_table_list)

+ 57 - 33
BiddingKG/dl/table_head/train.py

@@ -2,24 +2,40 @@ import sys
 import os
 sys.path.append(os.path.abspath("../../.."))
 os.environ['KERAS_BACKEND'] = 'tensorflow'
-from keras.metrics import categorical_accuracy
+from BiddingKG.dl.table_head.models.layer_utils import MyModelCheckpoint
 from BiddingKG.dl.table_head.metrics import precision, recall, f1
 from keras import optimizers, Model
 from BiddingKG.dl.table_head.models.model import get_model
 from BiddingKG.dl.table_head.loss import focal_loss
 from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
-from BiddingKG.dl.table_head.pre_process import get_data_from_file, get_data_from_sql, my_data_loader
+from BiddingKG.dl.table_head.pre_process import get_data_from_file, get_data_from_sql, my_data_loader, my_data_loader_2, \
+    get_random
 from keras import backend as K
 
-
-input_shape = (6, 10, 60)
-output_shape = (1,)
-batch_size = 32
-epochs = 1000
-pretrained_path = "checkpoints/best.hdf5"
-checkpoint_path = "checkpoints/"
-PRETRAINED = True
-CHECKPOINT = False
+model_id = 1
+
+if model_id == 1:
+    input_shape = (6, 20, 60)
+    output_shape = (1,)
+    batch_size = 128
+    epochs = 1000
+    PRETRAINED = True
+    CHECKPOINT = False
+    # use the GPU
+    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+else:
+    input_shape = (None, None, 20, 60)
+    output_shape = (None, None)
+    batch_size = 1
+    epochs = 1000
+    PRETRAINED = False
+    CHECKPOINT = False
+    # use the CPU
+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+pretrained_path = "checkpoints/" + str(model_id) + "/best.hdf5"
+checkpoint_path = "checkpoints/" + str(model_id) + "/"
 
 
 def train():
@@ -27,22 +43,31 @@ def train():
     print("gpus", K.tensorflow_backend._get_available_gpus())
 
     # Data
-    data_x, data_y = get_data_from_file('txt')
-    # data_x = data_x[:60000]
-    # data_y = data_y[:60000]
+    data_x, data_y = get_data_from_file('txt', model_id=model_id)
     print("finish read data", len(data_x))
 
     # Split -> Train, Test
-    split_size = int(len(data_x)*0.1)
-    test_x, test_y = data_x[:split_size], data_y[:split_size]
-    train_x, train_y = data_x[split_size:], data_y[split_size:]
+    if model_id == 1:
+        split_size = int(len(data_x)*0.1)
+        test_x, test_y = data_x[:split_size], data_y[:split_size]
+        train_x, train_y = data_x[split_size:], data_y[split_size:]
+    else:
+        data_x, data_y = get_random(data_x, data_y)
+        split_size = int(len(data_x)*0.1)
+        test_x, test_y = data_x[:split_size], data_y[:split_size]
+        train_x, train_y = data_x[split_size:], data_y[split_size:]
+    print("len(train_x), len(test_x)", len(train_x), len(test_x))
 
     # Data Loader
-    train_data_loader = my_data_loader(train_x, train_y, batch_size=batch_size)
-    test_data_loader = my_data_loader(test_x, test_y, batch_size=batch_size)
+    if model_id == 1:
+        train_data_loader = my_data_loader(train_x, train_y, batch_size=batch_size)
+        test_data_loader = my_data_loader(test_x, test_y, batch_size=batch_size)
+    else:
+        train_data_loader = my_data_loader_2(train_x, train_y, batch_size=batch_size)
+        test_data_loader = my_data_loader_2(test_x, test_y, batch_size=1)
 
     # Model
-    model = get_model(input_shape, output_shape)
+    model = get_model(input_shape, output_shape, model_id=model_id)
     if PRETRAINED:
         model.load_weights(pretrained_path)
         print("read pretrained model", pretrained_path)
@@ -54,16 +79,20 @@ def train():
     else:
         print("no checkpoint")
 
-    filepath = 'e{epoch:02d}-f1{val_f1:.2f}'
-    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5", monitor='val_f1',
-                                 verbose=1, save_best_only=True, mode='max')
+    filepath = 'e-{epoch:02d}_f1-{val_f1:.2f}'
+    # filepath = 'e-{epoch:02d}_acc-{val_loss:.2f}'
+    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5",
+                                 monitor='val_f1',
+                                 verbose=1,
+                                 save_best_only=True,
+                                 mode='max')
 
-    model.compile(optimizer=optimizers.Adam(lr=0.005), loss=focal_loss(),
-    # model.compile(optimizer=optimizers.Adam(lr=0.005), loss='binary_crossentropy',
-                  metrics=['acc',
-                           precision, recall, f1])
+    model.compile(optimizer=optimizers.Adam(lr=0.0005),
+                  loss={"output": focal_loss(3., 0.5)},
+                  # loss_weights={"output": 0.5},
+                  metrics=['acc', precision, recall, f1])
 
-    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=5,
+    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=10,
                             verbose=1, mode='max', cooldown=0, min_lr=0)
 
     model.fit_generator(train_data_loader,
@@ -73,11 +102,6 @@ def train():
                         validation_steps=max(1, len(test_x) // batch_size),
                         epochs=epochs)
 
-    # model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y,
-    #           validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
-    #           epochs=epochs, batch_size=256, shuffle=True,
-    #           callbacks=[checkpoint, rlu])
-
     return model, test_x
 
 

BIN
BiddingKG/dl/table_head/vocab_word.pk


+ 3 - 2
BiddingKG/dl/test/test4.py

@@ -46,7 +46,7 @@ def test(name,content):
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\test12354.txt","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
@@ -75,7 +75,8 @@ if __name__=="__main__":
     # '''
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
     # print(predict("12", content,"打印机"))
-    print(predict("12", text,"打印机"))
+    # content = codecs.open("D:\\Project\\format_conversion_maxcompute\\result.html", "r",encoding="utf8").read()
+    print(predict("12", content,"打印机"))
     # test(12,content)
     # test(12,text)
     print("takes",time.time()-_time1)

Some files were not shown because of the large number of changed files