
Update: use a model to recognize table headers

fangjiasheng 3 years ago
parent
revision
6c01e099e3

+ 2 - 0
.gitignore

@@ -13,3 +13,5 @@ node_modules
 /BiddingKG/dl/table_head/train_data/
 /BiddingKG/dl/table_head/check_user_result/
 /BiddingKG/dl/table_head/checkpoints/
+/BiddingKG/dl/table_head/data_new.csv
+/BiddingKG/dl/table_head/has_table_no_attach.xlsx

BIN
BiddingKG/dl/table_head/best.hdf5


+ 272 - 0
BiddingKG/dl/table_head/models/layer_utils.py

@@ -0,0 +1,272 @@
+import os
+import sys
+import tensorflow as tf
+from keras.callbacks import Callback
+import warnings
+from keras.layers import Layer
+import numpy as np
+sys.path.append(os.path.dirname(__file__))
+from pre_process import get_best_padding_size
+
+
+class BatchReshape1(Layer):
+    """
+    Merge the table's row and column dimensions into the batch dimension:
+    (batch, rows, cols, character_num, character_embed) -> (batch*rows*cols, character_num, character_embed)
+    """
+
+    def __init__(self, character_num, character_embed):
+        super(BatchReshape1, self).__init__()
+        self.character_num = character_num
+        self.character_embed = character_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch*height*width,
+                                      self.character_num, self.character_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, self.character_num, self.character_embed
+
+
+class BatchReshape2(Layer):
+    """
+    Split the row and column dimensions back out of the batch dimension:
+    (batch*rows*cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape2, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class BatchReshape3(Layer):
+    """
+    Merge the table's row dimension into the batch dimension:
+    (batch, rows, cols, cell_embed) -> (batch*rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape3, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch*height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, self.cell_embed
+
+
+class BatchReshape4(Layer):
+    """
+    Split the row dimension back out of the batch dimension:
+    (batch*rows, cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape4, self).__init__()
+        self.supports_masking = True
+        self.cell_embed = cell_embed
+
+    def compute_mask(self, inputs, mask=None):
+        # pass the mask through unchanged; the reshape itself does not alter masking
+        # if mask[0] is None:
+        #     return mask
+
+        # input1 = inputs[0]
+        # input2 = inputs[1]
+        # batch = tf.shape(input1)[0]
+        # height = tf.shape(input1)[1]
+        # width = tf.shape(input1)[2]
+        #
+        # mask_tensor = tf.reshape(mask[1], (batch, height, width, self.cell_embed))
+        return mask
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class BatchReshape5(Layer):
+    """
+    Flatten the table's row and column dimensions into a single sequence dimension:
+    (batch, rows, cols, cell_embed) -> (batch, rows*cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape5, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch, height*width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, self.cell_embed
+
+
+class BatchReshape6(Layer):
+    """
+    Split the flattened sequence dimension back into rows and columns:
+    (batch, rows*cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape6, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class MyPadding(Layer):
+    def __init__(self, pad_height, pad_width, cell_embed):
+        super(MyPadding, self).__init__()
+        self.pad_height = pad_height
+        self.pad_width = pad_width
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.pad(inputs, [[0, 0],
+                                  [0, self.pad_height - height],
+                                  [0, self.pad_width - width],
+                                  [0, 0]])
+
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class MySplit(Layer):
+    def __init__(self, height, width, **kwargs):
+        super(MySplit, self).__init__(**kwargs)
+        self.height = height
+        self.width = width
+
+    def call(self, inputs, mask=None, **kwargs):
+        outputs = inputs[:, 0:self.height, 0:self.width]
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None
+
+
+class MyModelCheckpoint(Callback):
+    def __init__(self, filepath, monitor='val_loss', verbose=0,
+                 save_best_only=False, save_weights_only=False,
+                 mode='auto', period=1):
+        super(MyModelCheckpoint, self).__init__()
+        self.monitor = monitor
+        self.verbose = verbose
+        self.filepath = filepath
+        self.save_best_only = save_best_only
+        self.save_weights_only = save_weights_only
+        self.period = period
+        self.epochs_since_last_save = 0
+
+        if mode not in ['auto', 'min', 'max']:
+            warnings.warn('ModelCheckpoint mode %s is unknown, '
+                          'fallback to auto mode.' % (mode),
+                          RuntimeWarning)
+            mode = 'auto'
+
+        if mode == 'min':
+            self.monitor_op = np.less
+            self.best = np.Inf
+        elif mode == 'max':
+            self.monitor_op = np.greater
+            self.best = -np.Inf
+        else:
+            monitor_str = self.monitor if isinstance(self.monitor, str) else ' '.join(self.monitor)
+            if 'acc' in monitor_str or monitor_str.startswith('fmeasure'):
+                self.monitor_op = np.greater
+                self.best = -np.Inf
+            else:
+                self.monitor_op = np.less
+                self.best = np.Inf
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        self.epochs_since_last_save += 1
+        if self.epochs_since_last_save >= self.period:
+            self.epochs_since_last_save = 0
+            filepath = self.filepath.format(epoch=epoch + 1, **logs)
+            if self.save_best_only:
+                # self.monitor may be one metric name or a list of names (their mean is used)
+                names = self.monitor if isinstance(self.monitor, (list, tuple)) else [self.monitor]
+                values = [logs.get(m) for m in names]
+                if None in values:
+                    warnings.warn('Can save best model only with %s available, '
+                                  'skipping.' % (self.monitor,), RuntimeWarning)
+                else:
+                    current = sum(values) / len(values)
+                    if self.monitor_op(current, self.best):
+                        if self.verbose > 0:
+                            print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
+                                  ' saving model to %s'
+                                  % (epoch + 1, self.monitor, self.best,
+                                     current, filepath))
+                        self.best = current
+                        if self.save_weights_only:
+                            self.model.save_weights(filepath, overwrite=True)
+                        else:
+                            self.model.save(filepath, overwrite=True)
+                    else:
+                        if self.verbose > 0:
+                            print('\nEpoch %05d: %s did not improve from %0.5f' %
+                                  (epoch + 1, self.monitor, self.best))
+            else:
+                if self.verbose > 0:
+                    print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
+                if self.save_weights_only:
+                    self.model.save_weights(filepath, overwrite=True)
+                else:
+                    self.model.save(filepath, overwrite=True)
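
Not part of this commit: the sketch below is one way BatchReshape1/BatchReshape2 could be combined, folding every cell of the table grid into the batch axis, running one shared per-cell sequence encoder, then restoring the (rows, cols) grid. The LSTM cell encoder and the concrete sizes are assumptions for illustration, not the repository's actual model.

import numpy as np
from keras import layers, models
from BiddingKG.dl.table_head.models.layer_utils import BatchReshape1, BatchReshape2

character_num, character_embed, cell_embed = 10, 60, 8

# (batch, rows, cols, character_num, character_embed); rows/cols left dynamic
inp = layers.Input(shape=(None, None, character_num, character_embed))
# -> (batch*rows*cols, character_num, character_embed)
flat = BatchReshape1(character_num, character_embed)(inp)
# one shared LSTM encodes each cell's character sequence into a cell embedding (assumed encoder)
cell_vec = layers.LSTM(cell_embed)(flat)
# -> (batch, rows, cols, cell_embed); inp is passed along only to recover rows/cols
grid = BatchReshape2(cell_embed)([inp, cell_vec])
model = models.Model(inp, grid)

x = np.random.uniform(0, 1, (2, 4, 5, character_num, character_embed))
print(model.predict(x).shape)  # (2, 4, 5, 8)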

+ 232 - 0
BiddingKG/dl/table_head/models/loop_lstm.py

@@ -0,0 +1,232 @@
+import keras
+import tensorflow as tf
+from keras import models, backend as K
+from keras.layers import Layer, Input, Lambda, Concatenate, Dense, LSTM, Bidirectional
+from tensorflow.contrib.rnn import LSTMCell
+import numpy as np
+
+from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+from BiddingKG.dl.table_head.models.u_net import u_net_small
+
+
+def attention(inputs, w_omega, b_omega, u_omega, time_major=False):
+    if isinstance(inputs, tuple):
+        inputs = tf.concat(inputs, 2)
+    if time_major:  # (B,T,D) => (T,B,D)
+        inputs = tf.transpose(inputs, [1, 0, 2])
+    v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)
+
+    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
+    alphas = tf.nn.softmax(vu, name='alphas')  # (B,T) shape
+    # the result has (B,D) shape
+    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
+
+    return output, alphas
+
+
+class LoopCell(Layer):
+    def __init__(self, hidden_size, attention_size, character_num, character_embed,
+                 cell_embed):
+        super(LoopCell, self).__init__()
+
+        # Hyper parameters
+        self.hidden_size = hidden_size
+        self.attention_size = attention_size
+        self.character_num = character_num
+        self.character_embed = character_embed
+        self.cell_embed = cell_embed
+
+    def build(self, batch_input_shape):
+        super(LoopCell, self).build(batch_input_shape)
+
+        # Trainable parameters
+        # Attention
+        # self.w_omega = self.add_weight("w_omega", shape=[self.hidden_size*2, self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+        # self.b_omega = self.add_weight("b_omega", shape=[self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+        # self.u_omega = self.add_weight("u_omega", shape=[self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+
+        # Bi-LSTM
+        # self.forward_cell = LSTMCell(self.hidden_size, forget_bias=1.0, state_is_tuple=True)
+        # self.backward_cell = LSTMCell(self.hidden_size, forget_bias=1.0, state_is_tuple=True)
+        # self.bi_lism = Bidirectional(LSTM(self.hidden_size, return_sequences=True))
+        # self.bi_lism.build(input_shape=(None, self.character_num, self.character_embed))
+        # self.trainable_weights += self.bi_lism.trainable_weights
+        #
+        # self.self_attention = SeqSelfAttention(attention_activation='sigmoid')
+        # self.self_attention.build(input_shape=(None, self.character_num, 2*self.hidden_size))
+        # self.trainable_weights += self.self_attention.trainable_weights
+        # print(self.trainable_weights)
+
+        # DNN
+        # self.w1 = self.add_weight('W1', [2*self.attention_size, self.cell_embed],
+        #                           initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                           trainable=True)
+        #
+        # self.b1 = self.add_weight('b1', [self.cell_embed],
+        #                           initializer=tf.zeros_initializer(),
+        #                           trainable=True)
+        # self.dense = Dense(self.cell_embed, activation="relu")
+        # print(batch_input_shape[0], batch_input_shape[1], batch_input_shape[2])
+        # self.dense.build(input_shape=(batch_input_shape[0]*batch_input_shape[1]*batch_input_shape[2],
+        #                               2*self.attention_size))
+        # self.trainable_weights += self.dense.trainable_weights
+
+    def call(self, inputs, mask=None, **kwargs):
+        def fn(x):
+            print("fn_0", x)
+
+            # (batch*height*width, character_num, hidden_size)
+            # outputs, last_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.forward_cell,
+            #                                                        cell_bw=self.backward_cell,
+            #                                                        inputs=x,
+            #                                                        dtype=tf.float32,
+            #                                                        time_major=False)
+            # (batch*height*width, character_num, 2*hidden_size)
+            # outputs = self.bi_lism(x)
+            # print("fn_1", outputs)
+
+            # (batch*height*width, character_num, 2*hidden_size)
+            # outputs = self.self_attention(outputs)
+            # print("fn_2", outputs)
+
+            # (batch*height*width, 2*hidden_size)
+            # outputs, _ = attention(outputs, self.w_omega, self.b_omega,
+            #                        self.u_omega, time_major=False)
+
+
+            # (batch*height*width, cell_embedding)
+            # outputs = tf.nn.xw_plus_b(outputs, self.w1, self.b1)
+            # outputs = self.dense(outputs)
+            # print("fn_3", outputs)
+            # all transforms above are commented-out experiments; pass the input through
+            return x
+
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        # (batch, height*width, character_num(time_step), character_embedding)
+        # inputs = tf.reshape(inputs, (tf.shape(inputs)[0],
+        #                              height*width,
+        #                              inputs.shape[3], inputs.shape[4]))
+
+        # (batch*height*width, character_num, character_embedding)
+        outputs = tf.reshape(inputs, (batch*height*width,
+                                      inputs.shape[3], inputs.shape[4]))
+
+        # (height*width, batch, character_num(time_step), character_embedding)
+        # inputs = tf.transpose(inputs, (1, 0, 2, 3))
+
+        # split height*width, each cell
+        # (height*width, batch, cell_embedding)
+        # outputs = tf.map_fn(fn=lambda x: fn(x), elems=inputs, dtype=tf.float32)
+        # print("loop_lstm_1", outputs)
+        # outputs = tf.squeeze(outputs, 0)
+
+        # (batch*height*width, 2*attention_size)
+        # outputs = fn(inputs)
+        # print("loop_lstm_2", outputs)
+
+        # (1, batch*height*width, 2*attention_size)
+        # outputs = tf.expand_dims(outputs, 0)
+        # print("loop_lstm_3", outputs)
+
+        # (batch*height*width, cell_embedding)
+        # outputs = Dense(self.cell_embed, activation="relu")(outputs)
+        # print("loop_lstm_3", outputs)
+
+        # (batch, height*width, cell_embedding)
+        # outputs = tf.transpose(outputs, (1, 0, 2))
+        # print("loop_lstm_2", outputs)
+
+        # (batch, height, width, cell_embedding)
+        # outputs = tf.reshape(outputs, (batch, height, width, self.cell_embed))
+        # print("loop_lstm_4", outputs)
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, self.character_num, self.character_embed
+
+
+class BatchReshape(Layer):
+    def __init__(self, cell_embed):
+        super(BatchReshape, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        # (batch, height, width, cell_embedding)
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        print("batch_reshape", outputs)
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+# def batch_reshape(x):
+#     return K.reshape(x, (batch, height, width, cell_embed))
+
+
+if __name__ == '__main__':
+    hidden_size = 64
+    attention_size = 64
+    character_num = 10
+    character_embed = 60
+    cell_embed = 8
+    # rows/cols are left dynamic so tables of different sizes can be fed
+    input_shape = (None, None, character_num, character_embed)
+
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    X_train = np.random.uniform(0, 1, (10, 16, 8, 10, 60))
+    X_test = np.random.uniform(0, 1, (10, 16, 8, 10, 60))
+    y_train = np.random.uniform(0, 1, (10, 16, 8))
+    y_test = np.random.uniform(0, 1, (10, 16, 8))
+
+    _input = Input(shape=input_shape, dtype="float32")
+
+    # fold the table grid into the batch axis:
+    # (batch*rows*cols, character_num, character_embed)
+    loop_cell = LoopCell(hidden_size, attention_size,
+                         character_num, character_embed,
+                         cell_embed)(_input)
+
+    # encode each cell's character sequence into a fixed-size cell embedding
+    # (batch*rows*cols, cell_embed)
+    cell_vec = LSTM(cell_embed)(loop_cell)
+    dense = Dense(cell_embed, activation="relu")(cell_vec)
+
+    # restore the grid: (batch, rows, cols, cell_embed)
+    grid = BatchReshape(cell_embed)([_input, dense])
+
+    # 2-D context over the whole table (row/col counts must be divisible by 8)
+    u_net = u_net_small(grid)
+    merge = Concatenate(axis=-1)([grid, u_net])
+    dense = Dense(cell_embed, activation='relu')(merge)
+    dense = Dense(1, activation='sigmoid')(dense)
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+    model = models.Model(inputs=_input, outputs=squeeze)
+    model.summary(line_length=120)
+    model.compile(loss='binary_crossentropy', optimizer='adam')
+    model.fit(X_train, y_train,
+              epochs=2,
+              batch_size=1,
+              validation_data=(X_test, y_test))
+
+    # a second run with larger tables to check that dynamic row/col sizes work
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    X_train = np.random.uniform(0, 1, (5, 32, 24, 10, 60))
+    X_test = np.random.uniform(0, 1, (5, 32, 24, 10, 60))
+    y_train = np.random.uniform(0, 1, (5, 32, 24))
+    y_test = np.random.uniform(0, 1, (5, 32, 24))
+
+    model.fit(X_train, y_train,
+              epochs=2,
+              batch_size=1,
+              validation_data=(X_test, y_test))
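
Not part of this commit: a small TF1-style sketch of the attention() helper defined at the top of this file. The placeholder sizes are made up; it only shows the helper pooling a (batch, time, dim) sequence into one (batch, dim) vector with learned weights.

import numpy as np
import tensorflow as tf
from BiddingKG.dl.table_head.models.loop_lstm import attention

hidden_size, attention_size = 32, 16
seq = tf.placeholder(tf.float32, (None, 10, hidden_size))
w_omega = tf.Variable(tf.random_uniform([hidden_size, attention_size], -0.25, 0.25))
b_omega = tf.Variable(tf.random_uniform([attention_size], -0.25, 0.25))
u_omega = tf.Variable(tf.random_uniform([attention_size], -0.25, 0.25))
pooled, alphas = attention(seq, w_omega, b_omega, u_omega)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(pooled, {seq: np.random.rand(4, 10, hidden_size)})
    print(out.shape)  # (4, 32): one attention-weighted vector per sequence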

+ 58 - 0
BiddingKG/dl/table_head/models/model_2.py

@@ -0,0 +1,58 @@
+import sys
+import os
+sys.path.append(os.path.abspath("../.."))
+from keras import layers, models
+import tensorflow as tf
+from BiddingKG.dl.table_head.models.my_average_pooling import MyAveragePooling1D
+from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+
+
+def get_model(input_shape, output_shape):
+    # Input
+    input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_3 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_4 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_5 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_6 = layers.Input(shape=input_shape[1:], dtype="float32")
+
+    # Bi-LSTM
+    bi_lstm_1 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_1)
+    bi_lstm_2 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_2)
+    bi_lstm_3 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_3)
+    bi_lstm_4 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_4)
+    bi_lstm_5 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_5)
+    bi_lstm_6 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_6)
+
+    # Self-Attention
+    self_attention_1 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_1)
+    self_attention_2 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_2)
+    self_attention_3 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_3)
+    self_attention_4 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_4)
+    self_attention_5 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_5)
+    self_attention_6 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_6)
+
+    # Concat
+    concat_1 = layers.concatenate([self_attention_1, self_attention_2, self_attention_3])
+    concat_2 = layers.concatenate([self_attention_4, self_attention_5, self_attention_6])
+
+    # Dense + Sigmoid
+    dense_1 = layers.Dense(output_shape[0], activation="sigmoid")(concat_1)
+    dense_2 = layers.Dense(output_shape[0], activation="sigmoid")(concat_2)
+
+    # mask mean pooling
+    pool_1 = MyAveragePooling1D(axis=1)(dense_1)
+    pool_2 = MyAveragePooling1D(axis=1)(dense_2)
+
+    # Concat
+    concat = layers.concatenate([pool_1, pool_2])
+
+    # Dense
+    output = layers.Dense(10)(concat)
+    output = layers.Dense(1, activation="sigmoid", name='output')(output)
+
+    model = models.Model(inputs=[input_1, input_2, input_3, input_4, input_5, input_6],
+                         outputs=output)
+
+    model.summary()
+    return model
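
Not part of this commit: a minimal smoke test of get_model(). The shapes below are assumptions (MyAveragePooling1D is defined elsewhere in the repo); the point is only that the model takes six equally shaped sequence inputs and emits one sigmoid score per sample.

import numpy as np
from BiddingKG.dl.table_head.models.model_2 import get_model

input_shape = (None, 20, 60)   # (batch, time_steps, embedding) -- assumed sizes
output_shape = (20,)           # width of the per-branch Dense layer -- assumed
model = get_model(input_shape, output_shape)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

x = [np.random.rand(8, 20, 60).astype('float32') for _ in range(6)]
y = np.random.randint(0, 2, (8, 1)).astype('float32')
model.fit(x, y, epochs=1, batch_size=4)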

+ 49 - 0
BiddingKG/dl/table_head/models/tf_bi_lstm.py

@@ -0,0 +1,49 @@
+import tensorflow as tf
+from tensorflow.contrib.rnn import LSTMCell
+from tensorflow.contrib.rnn import MultiRNNCell
+
+
+class LstmBase:
+    """
+    build rnn cell
+    """
+    def build_rnn(self, hidden_size, num_layers):
+        cells = []
+        for i in range(num_layers):
+            cell = LSTMCell(num_units=hidden_size,
+                            state_is_tuple=True,
+                            initializer=tf.random_uniform_initializer(-0.25, 0.25))
+            cells.append(cell)
+        cells = MultiRNNCell(cells, state_is_tuple=True)
+
+        return cells
+
+
+class BiLstm(LstmBase):
+    """
+    define the lstm
+    """
+    def __init__(self, scope_name, hidden_size, num_layers):
+        super(BiLstm, self).__init__()
+        assert hidden_size % 2 == 0
+        hidden_size //= 2  # integer division so LSTMCell num_units stays an int
+
+        self.fw_rnns = []
+        self.bw_rnns = []
+        for i in range(num_layers):
+            self.fw_rnns.append(self.build_rnn(hidden_size, 1))
+            self.bw_rnns.append(self.build_rnn(hidden_size, 1))
+
+        self.scope_name = scope_name
+
+    def __call__(self, input, input_len):
+        for idx, (fw_rnn, bw_rnn) in enumerate(zip(self.fw_rnns, self.bw_rnns)):
+            scope_name = '{}_{}'.format(self.scope_name, idx)
+            ctx, _ = tf.nn.bidirectional_dynamic_rnn(
+                fw_rnn, bw_rnn, input, sequence_length=input_len,
+                dtype=tf.float32, time_major=False,
+                scope=scope_name
+            )
+            input = tf.concat(ctx, -1)
+        ctx = input
+        return ctx
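
Not part of this commit: a TF1 sketch of BiLstm on a padded batch of variable-length sequences (all sizes are made up). hidden_size is split into two directions of hidden_size // 2 each, so the concatenated context keeps the requested width.

import numpy as np
import tensorflow as tf
from BiddingKG.dl.table_head.models.tf_bi_lstm import BiLstm

seq = tf.placeholder(tf.float32, (None, 15, 60))    # (batch, time, embed)
lengths = tf.placeholder(tf.int32, (None,))
encoder = BiLstm('cell_encoder', hidden_size=64, num_layers=1)
ctx = encoder(seq, lengths)                          # (batch, time, 64): fw + bw halves

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(ctx, {seq: np.random.rand(4, 15, 60),
                         lengths: np.array([15, 10, 8, 15])})
    print(out.shape)  # (4, 15, 64)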

+ 82 - 0
BiddingKG/dl/table_head/models/u_net.py

@@ -0,0 +1,82 @@
+from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, BatchNormalization, UpSampling2D
+from keras.layers import LeakyReLU
+
+
+def u_net_small(inputs, num_classes=1):
+    # 8
+    use_bias = False
+    down0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(inputs)
+    down0 = BatchNormalization()(down0)
+    down0 = LeakyReLU(alpha=0.)(down0)
+    down0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(down0)
+    down0 = BatchNormalization()(down0)
+    down0 = LeakyReLU(alpha=0.)(down0)
+    down0_pool = MaxPooling2D((2, 2), strides=(2, 2))(down0)
+
+    # 4
+    down1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(down0_pool)
+    down1 = BatchNormalization()(down1)
+    down1 = LeakyReLU(alpha=0.)(down1)
+    down1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(down1)
+    down1 = BatchNormalization()(down1)
+    down1 = LeakyReLU(alpha=0.)(down1)
+    down1_pool = MaxPooling2D((2, 2), strides=(2, 2))(down1)
+
+    # 2
+    down2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(down1_pool)
+    down2 = BatchNormalization()(down2)
+    down2 = LeakyReLU(alpha=0.)(down2)
+    down2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(down2)
+    down2 = BatchNormalization()(down2)
+    down2 = LeakyReLU(alpha=0.)(down2)
+    down2_pool = MaxPooling2D((2, 2), strides=(2, 2))(down2)
+
+    center = Conv2D(64, (3, 3), padding='same', use_bias=use_bias)(down2_pool)
+    center = BatchNormalization()(center)
+    center = LeakyReLU(alpha=0.)(center)
+    center = Conv2D(64, (3, 3), padding='same', use_bias=use_bias)(center)
+    center = BatchNormalization()(center)
+    center = LeakyReLU(alpha=0.)(center)
+
+    # 2
+    up2 = UpSampling2D((2, 2))(center)
+    up2 = concatenate([down2, up2], axis=3)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+
+    # 4
+    up1 = UpSampling2D((2, 2))(up2)
+    up1 = concatenate([down1, up1], axis=3)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+
+    # 8
+    up0 = UpSampling2D((2, 2))(up1)
+    up0 = concatenate([down0, up0], axis=3)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+
+    # classify
+    # classify = Conv2D(num_classes, (1, 1), activation='sigmoid')(up0)
+    return up0
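
Not part of this commit: a minimal sketch wrapping u_net_small() in a Keras model. Because of the three 2x2 poolings, the spatial (row, column) sizes should be divisible by 8; the sizes below are placeholders.

import numpy as np
from keras import layers, models
from BiddingKG.dl.table_head.models.u_net import u_net_small

inp = layers.Input(shape=(16, 24, 8))           # (rows, cols, cell_embed) -- assumed
features = u_net_small(inp)                     # same spatial size, 8 feature channels
out = layers.Conv2D(1, (1, 1), activation='sigmoid')(features)
model = models.Model(inp, out)

x = np.random.rand(2, 16, 24, 8).astype('float32')
print(model.predict(x).shape)                   # (2, 16, 24, 1)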

+ 17 - 0
BiddingKG/dl/table_head/preprocessing_test.py

@@ -0,0 +1,17 @@
+import codecs
+import pandas as pd
+from bs4 import BeautifulSoup
+from BiddingKG.dl.interface.extract import predict
+
+
+def test():
+    df = pd.read_excel("has_table_no_attach.xlsx")
+    for index, row in df.iterrows():
+        if index % 100 == 0:
+            print("Loop", index)
+        text = row['dochtmlcon']
+        predict(str(index), text)
+
+
+if __name__ == "__main__":
+    test()

+ 188 - 0
BiddingKG/dl/table_head/table_simplify.py

@@ -0,0 +1,188 @@
+#coding:utf-8
+import json
+import logging
+
+from BiddingKG.dl.table_head.pre_process import postgresql_util
+
+
+user_score = {
+    "test": 1.,
+    "test1": 0.83,
+    "test11": 0.82,
+    "test12": 0.74,
+    "test16": 0.83,
+    "test17": 0.77,
+    "test19": 0.79,
+    "test20": 0.82,
+    "test21": 0.73,
+    "test22": 0.64,
+    "test25": 0.77,
+    "test26": 0.80,
+    "test27": 0.72,
+    "test29": 0.8,
+    "test3": 0.,
+    "test7": 0.82,
+    "test8": 0.78,
+    "test9": 0.80,
+}
+
+
+def get_labeled_table():
+    sql = """
+    select id, update_user, table_text, pre_label, post_label
+    from label_table_head_info where status = 0
+    """
+
+    result_list = postgresql_util(sql, limit=1000000)
+    print("len(result_list)", len(result_list))
+    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+        not_eval_table_list = f.read()
+    not_eval_table_list = eval(not_eval_table_list)
+
+    table_list = []
+    # not_eval_table_list = []
+    for table in result_list:
+        pre_label = eval(table[3])
+        post_label = eval(table[4])
+        _id = table[0]
+        update_user = table[1]
+        table_text = table[2]
+        if _id in not_eval_table_list:
+            continue
+
+        try:
+            if table_text[0] == '"':
+                table_text = eval(table_text)
+            else:
+                table_text = table_text
+            table_text = table_text.replace('\\', '/')
+            table_text = eval(table_text)
+        except:
+            print("Failed to parse table_text", _id)
+            not_eval_table_list.append(_id)
+            continue
+
+        if post_label:
+            label_list = post_label
+        else:
+            label_list = pre_label
+
+        table_list.append([table_text, label_list, update_user, _id])
+    print("len(table_list)", len(table_list))
+    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "w") as f:
+    #     f.write(str(not_eval_table_list))
+    return table_list
+
+
+def table_distance(table1, table2, thresh=0.85):
+    # flatten
+    table1 = [col for row in table1 for col in row]
+    table2 = [col for row in table2 for col in row]
+    while "" in table1:
+        table1.remove("")
+    while "" in table2:
+        table2.remove("")
+
+    equal_cnt = 0
+    not_equal_cnt = 0
+    equal_flag = 0
+    for col1 in table1:
+        find_flag = 0
+        for col2 in table2:
+            if col1 == col2:
+                equal_cnt += 1
+                find_flag = 1
+                break
+        if not find_flag:
+            not_equal_cnt += 1
+        # print(equal_cnt, not_equal_cnt)
+        if round(equal_cnt / max(len(table1), len(table2)), 2) >= thresh:
+            # print("> thresh")
+            equal_flag = 1
+            break
+        if round(not_equal_cnt / max(len(table1), len(table2)), 2) >= 1-thresh:
+            # print("> 1-thresh")
+            equal_flag = 0
+            break
+    return equal_flag
+
+
+def remove_duplicate(table_list):
+    logging.info("into remove_duplicate")
+    table_list.sort(key=lambda x: x[0])
+    delete_table_id_list = []
+    for i in range(len(table_list)):
+        delete_table_id_list = list(set(delete_table_id_list))
+        if i % 1000 == 0:
+            print("Loop", i, "len(delete_table_id_list)", len(delete_table_id_list))
+            logging.info("*")
+            with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "w") as f:
+                f.write(str(delete_table_id_list))
+        table1 = table_list[i]
+        if len(table1[0]) <= 2 and len(table1[0][0]) <= 2:
+            delete_table_id_list.append(table1[3])
+            continue
+        for j in range(i+1, len(table_list)):
+            table2 = table_list[j]
+            if len(table2[0]) <= 2 and len(table2[0][0]) <= 2:
+                delete_table_id_list.append(table2[3])
+                continue
+            # skip if the row counts differ by 2 or more
+            if abs(len(table1[0]) - len(table2[0])) >= 2:
+                continue
+            # skip if the column counts differ by 2 or more
+            if abs(len(table1[0][0]) - len(table2[0][0])) >= 2:
+                continue
+            if table_distance(table1[0], table2[0]):
+                print("equal", table1[3], table2[3])
+                score1 = user_score.get(table1[2])
+                score2 = user_score.get(table2[2])
+                if score1 is None:
+                    score1 = 0.
+                if score2 is None:
+                    score2 = 0.
+                if score1 >= score2:
+                    delete_table_id_list.append(table2[3])
+                else:
+                    delete_table_id_list.append(table1[3])
+
+    delete_table_id_list = list(set(delete_table_id_list))
+    new_table_list = []
+    for table in table_list:
+        if table[3] not in delete_table_id_list:
+            new_table_list.append(table)
+    return new_table_list
+
+
+def eval_table(_str):
+    try:
+        if _str[0] == '"':
+            table_text = eval(_str)
+        else:
+            table_text = _str
+        table_text = table_text.replace('\\', '/')
+        table_text = eval(table_text)
+    except:
+        print("Failed to parse table_text")
+        table_text = ""
+    return table_text
+
+
+if __name__ == '__main__':
+    _list = get_labeled_table()
+    _list = remove_duplicate(_list)
+    _str = json.dumps(str(_list))
+    with open(r"C:\Users\Administrator\Desktop\table_simplify.txt", "w") as f:
+        f.write(_str)
+
+    # _str1 = "[['', '', 'Yes']]"
+    # _str2 = "[['', '', 'Yes', '']]"
+    # table1 = eval_table(_str1)
+    # table2 = eval_table(_str2)
+    #
+    # print(table_distance(table1, table2))
+
+    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+    #     not_eval_table_list = f.read()
+    # print(not_eval_table_list)
+    # not_eval_table_list = eval(not_eval_table_list)
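
Not part of this commit: a tiny illustration of table_distance(). It flattens both tables, drops empty cells, and returns 1 as soon as the share of matching cells reaches thresh, or 0 once the share of mismatched cells exceeds 1 - thresh. The table contents are made-up examples.

t1 = [["No.", "Name", "Amount"], ["1", "Item A", "100"]]
t2 = [["No.", "Name", "Amount"], ["2", "Item B", "200"]]
print(table_distance(t1, t2, thresh=0.5))  # 1: the shared header row already reaches 0.5
print(table_distance(t1, t2, thresh=0.9))  # 0: too many differing cells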

BIN
BiddingKG/dl/table_head/vocab_word.pk