
Update: use a model to recognize table headers

fangjiasheng 3 years ago
parent
revision
6c01e099e3

+ 2 - 0
.gitignore

@@ -13,3 +13,5 @@ node_modules
 /BiddingKG/dl/table_head/train_data/
 /BiddingKG/dl/table_head/check_user_result/
 /BiddingKG/dl/table_head/checkpoints/
+/BiddingKG/dl/table_head/data_new.csv
+/BiddingKG/dl/table_head/has_table_no_attach.xlsx

BIN
BiddingKG/dl/table_head/best.hdf5


+ 272 - 0
BiddingKG/dl/table_head/models/layer_utils.py

@@ -0,0 +1,272 @@
+import os
+import sys
+import tensorflow as tf
+from keras.callbacks import Callback
+import warnings
+from keras.layers import Layer
+import numpy as np
+sys.path.append(os.path.dirname(__file__))
+from pre_process import get_best_padding_size
+
+
+class BatchReshape1(Layer):
+    """
+    Merge the table's row and column dimensions into the batch dimension:
+    (batch, rows, cols, character_num, character_embed) -> (batch*rows*cols, character_num, character_embed)
+    """
+
+    def __init__(self, character_num, character_embed):
+        super(BatchReshape1, self).__init__()
+        self.character_num = character_num
+        self.character_embed = character_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch*height*width,
+                                      self.character_num, self.character_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, self.character_num, self.character_embed
+
+
+class BatchReshape2(Layer):
+    """
+    Split the row and column dimensions back out of the batch dimension:
+    (batch*rows*cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape2, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class BatchReshape3(Layer):
+    """
+    Merge the table's row dimension into the batch dimension:
+    (batch, rows, cols, cell_embed) -> (batch*rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape3, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch*height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, self.cell_embed
+
+
+class BatchReshape4(Layer):
+    """
+    Split the row dimension back out of the batch dimension:
+    (batch*rows, cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape4, self).__init__()
+        self.supports_masking = True
+        self.cell_embed = cell_embed
+
+    def compute_mask(self, inputs, mask=None):
+        # pass the mask through unchanged; the reshape itself does not alter masking
+        # if mask[0] is None:
+        #     return mask
+
+        # input1 = inputs[0]
+        # input2 = inputs[1]
+        # batch = tf.shape(input1)[0]
+        # height = tf.shape(input1)[1]
+        # width = tf.shape(input1)[2]
+        #
+        # mask_tensor = tf.reshape(mask[1], (batch, height, width, self.cell_embed))
+        return mask
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class BatchReshape5(Layer):
+    """
+    Flatten the table's row and column dimensions into a single sequence dimension:
+    (batch, rows, cols, cell_embed) -> (batch, rows*cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape5, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.reshape(inputs, (batch, height*width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, self.cell_embed
+
+
+class BatchReshape6(Layer):
+    """
+    Split the flattened sequence dimension back into rows and columns:
+    (batch, rows*cols, cell_embed) -> (batch, rows, cols, cell_embed)
+    """
+
+    def __init__(self, cell_embed):
+        super(BatchReshape6, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class MyPadding(Layer):
+    def __init__(self, pad_height, pad_width, cell_embed):
+        super(MyPadding, self).__init__()
+        self.pad_height = pad_height
+        self.pad_width = pad_width
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        outputs = tf.pad(inputs, [[0, 0],
+                                  [0, self.pad_height - height],
+                                  [0, self.pad_width - width],
+                                  [0, 0]])
+
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+class MySplit(Layer):
+    def __init__(self, height, width, **kwargs):
+        super(MySplit, self).__init__(**kwargs)
+        self.height = height
+        self.width = width
+
+    def call(self, inputs, mask=None, **kwargs):
+        outputs = inputs[:, 0:self.height, 0:self.width]
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None
+
+
+class MyModelCheckpoint(Callback):
+    def __init__(self, filepath, monitor='val_loss', verbose=0,
+                 save_best_only=False, save_weights_only=False,
+                 mode='auto', period=1):
+        super(MyModelCheckpoint, self).__init__()
+        self.monitor = monitor
+        self.verbose = verbose
+        self.filepath = filepath
+        self.save_best_only = save_best_only
+        self.save_weights_only = save_weights_only
+        self.period = period
+        self.epochs_since_last_save = 0
+
+        if mode not in ['auto', 'min', 'max']:
+            warnings.warn('ModelCheckpoint mode %s is unknown, '
+                          'fallback to auto mode.' % (mode),
+                          RuntimeWarning)
+            mode = 'auto'
+
+        if mode == 'min':
+            self.monitor_op = np.less
+            self.best = np.Inf
+        elif mode == 'max':
+            self.monitor_op = np.greater
+            self.best = -np.Inf
+        else:
+            monitor_str = self.monitor if isinstance(self.monitor, str) else ' '.join(self.monitor)
+            if 'acc' in monitor_str or monitor_str.startswith('fmeasure'):
+                self.monitor_op = np.greater
+                self.best = -np.Inf
+            else:
+                self.monitor_op = np.less
+                self.best = np.Inf
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        self.epochs_since_last_save += 1
+        if self.epochs_since_last_save >= self.period:
+            self.epochs_since_last_save = 0
+            filepath = self.filepath.format(epoch=epoch + 1, **logs)
+            if self.save_best_only:
+                # self.monitor may be one metric name or a list of names (their mean is used)
+                names = self.monitor if isinstance(self.monitor, (list, tuple)) else [self.monitor]
+                values = [logs.get(m) for m in names]
+                if None in values:
+                    warnings.warn('Can save best model only with %s available, '
+                                  'skipping.' % (self.monitor,), RuntimeWarning)
+                else:
+                    current = sum(values) / len(values)
+                    if self.monitor_op(current, self.best):
+                        if self.verbose > 0:
+                            print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
+                                  ' saving model to %s'
+                                  % (epoch + 1, self.monitor, self.best,
+                                     current, filepath))
+                        self.best = current
+                        if self.save_weights_only:
+                            self.model.save_weights(filepath, overwrite=True)
+                        else:
+                            self.model.save(filepath, overwrite=True)
+                    else:
+                        if self.verbose > 0:
+                            print('\nEpoch %05d: %s did not improve from %0.5f' %
+                                  (epoch + 1, self.monitor, self.best))
+            else:
+                if self.verbose > 0:
+                    print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
+                if self.save_weights_only:
+                    self.model.save_weights(filepath, overwrite=True)
+                else:
+                    self.model.save(filepath, overwrite=True)
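
Not part of this commit: the sketch below is one way BatchReshape1/BatchReshape2 could be combined, folding every cell of the table grid into the batch axis, running one shared per-cell sequence encoder, then restoring the (rows, cols) grid. The LSTM cell encoder and the concrete sizes are assumptions for illustration, not the repository's actual model.

import numpy as np
from keras import layers, models
from BiddingKG.dl.table_head.models.layer_utils import BatchReshape1, BatchReshape2

character_num, character_embed, cell_embed = 10, 60, 8

# (batch, rows, cols, character_num, character_embed); rows/cols left dynamic
inp = layers.Input(shape=(None, None, character_num, character_embed))
# -> (batch*rows*cols, character_num, character_embed)
flat = BatchReshape1(character_num, character_embed)(inp)
# one shared LSTM encodes each cell's character sequence into a cell embedding (assumed encoder)
cell_vec = layers.LSTM(cell_embed)(flat)
# -> (batch, rows, cols, cell_embed); inp is passed along only to recover rows/cols
grid = BatchReshape2(cell_embed)([inp, cell_vec])
model = models.Model(inp, grid)

x = np.random.uniform(0, 1, (2, 4, 5, character_num, character_embed))
print(model.predict(x).shape)  # (2, 4, 5, 8)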

+ 232 - 0
BiddingKG/dl/table_head/models/loop_lstm.py

@@ -0,0 +1,232 @@
+import keras
+import tensorflow as tf
+from keras import models, backend as K
+from keras.layers import Layer, Input, Lambda, Concatenate, Dense, LSTM, Bidirectional
+from tensorflow.contrib.rnn import LSTMCell
+import numpy as np
+
+from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+from BiddingKG.dl.table_head.models.u_net import u_net_small
+
+
+def attention(inputs, w_omega, b_omega, u_omega, time_major=False):
+    if isinstance(inputs, tuple):
+        inputs = tf.concat(inputs, 2)
+    if time_major:  # (B,T,D) => (T,B,D)
+        inputs = tf.transpose(inputs, [1, 0, 2])
+    v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)
+
+    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
+    alphas = tf.nn.softmax(vu, name='alphas')  # (B,T) shape
+    # the result has (B,D) shape
+    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
+
+    return output, alphas
+
+
+class LoopCell(Layer):
+    def __init__(self, hidden_size, attention_size, character_num, character_embed,
+                 cell_embed):
+        super(LoopCell, self).__init__()
+
+        # Hyper parameters
+        self.hidden_size = hidden_size
+        self.attention_size = attention_size
+        self.character_num = character_num
+        self.character_embed = character_embed
+        self.cell_embed = cell_embed
+
+    def build(self, batch_input_shape):
+        super(LoopCell, self).build(batch_input_shape)
+
+        # Trainable parameters
+        # Attention
+        # self.w_omega = self.add_weight("w_omega", shape=[self.hidden_size*2, self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+        # self.b_omega = self.add_weight("b_omega", shape=[self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+        # self.u_omega = self.add_weight("u_omega", shape=[self.attention_size],
+        #                                initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                                trainable=True)
+
+        # Bi-LSTM
+        # self.forward_cell = LSTMCell(self.hidden_size, forget_bias=1.0, state_is_tuple=True)
+        # self.backward_cell = LSTMCell(self.hidden_size, forget_bias=1.0, state_is_tuple=True)
+        # self.bi_lism = Bidirectional(LSTM(self.hidden_size, return_sequences=True))
+        # self.bi_lism.build(input_shape=(None, self.character_num, self.character_embed))
+        # self.trainable_weights += self.bi_lism.trainable_weights
+        #
+        # self.self_attention = SeqSelfAttention(attention_activation='sigmoid')
+        # self.self_attention.build(input_shape=(None, self.character_num, 2*self.hidden_size))
+        # self.trainable_weights += self.self_attention.trainable_weights
+        # print(self.trainable_weights)
+
+        # DNN
+        # self.w1 = self.add_weight('W1', [2*self.attention_size, self.cell_embed],
+        #                           initializer=tf.random_uniform_initializer(-0.25, 0.25),
+        #                           trainable=True)
+        #
+        # self.b1 = self.add_weight('b1', [self.cell_embed],
+        #                           initializer=tf.zeros_initializer(),
+        #                           trainable=True)
+        # self.dense = Dense(self.cell_embed, activation="relu")
+        # print(batch_input_shape[0], batch_input_shape[1], batch_input_shape[2])
+        # self.dense.build(input_shape=(batch_input_shape[0]*batch_input_shape[1]*batch_input_shape[2],
+        #                               2*self.attention_size))
+        # self.trainable_weights += self.dense.trainable_weights
+
+    def call(self, inputs, mask=None, **kwargs):
+        def fn(x):
+            print("fn_0", x)
+
+            # (batch*height*width, character_num, hidden_size)
+            # outputs, last_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.forward_cell,
+            #                                                        cell_bw=self.backward_cell,
+            #                                                        inputs=x,
+            #                                                        dtype=tf.float32,
+            #                                                        time_major=False)
+            # (batch*height*width, character_num, 2*hidden_size)
+            # outputs = self.bi_lism(x)
+            # print("fn_1", outputs)
+
+            # (batch*height*width, character_num, 2*hidden_size)
+            # outputs = self.self_attention(outputs)
+            # print("fn_2", outputs)
+
+            # (batch*height*width, 2*hidden_size)
+            # outputs, _ = attention(outputs, self.w_omega, self.b_omega,
+            #                        self.u_omega, time_major=False)
+
+
+            # (batch*height*width, cell_embedding)
+            # outputs = tf.nn.xw_plus_b(outputs, self.w1, self.b1)
+            # outputs = self.dense(outputs)
+            # print("fn_3", outputs)
+            # all transforms above are commented-out experiments; pass the input through
+            return x
+
+        batch = tf.shape(inputs)[0]
+        height = tf.shape(inputs)[1]
+        width = tf.shape(inputs)[2]
+
+        # (batch, height*width, character_num(time_step), character_embedding)
+        # inputs = tf.reshape(inputs, (tf.shape(inputs)[0],
+        #                              height*width,
+        #                              inputs.shape[3], inputs.shape[4]))
+
+        # (batch*height*width, character_num, character_embedding)
+        outputs = tf.reshape(inputs, (batch*height*width,
+                                      inputs.shape[3], inputs.shape[4]))
+
+        # (height*width, batch, character_num(time_step), character_embedding)
+        # inputs = tf.transpose(inputs, (1, 0, 2, 3))
+
+        # split height*width, each cell
+        # (height*width, batch, cell_embedding)
+        # outputs = tf.map_fn(fn=lambda x: fn(x), elems=inputs, dtype=tf.float32)
+        # print("loop_lstm_1", outputs)
+        # outputs = tf.squeeze(outputs, 0)
+
+        # (batch*height*width, 2*attention_size)
+        # outputs = fn(inputs)
+        # print("loop_lstm_2", outputs)
+
+        # (1, batch*height*width, 2*attention_size)
+        # outputs = tf.expand_dims(outputs, 0)
+        # print("loop_lstm_3", outputs)
+
+        # (batch*height*width, cell_embedding)
+        # outputs = Dense(self.cell_embed, activation="relu")(outputs)
+        # print("loop_lstm_3", outputs)
+
+        # (batch, height*width, cell_embedding)
+        # outputs = tf.transpose(outputs, (1, 0, 2))
+        # print("loop_lstm_2", outputs)
+
+        # (batch, height, width, cell_embedding)
+        # outputs = tf.reshape(outputs, (batch, height, width, self.cell_embed))
+        # print("loop_lstm_4", outputs)
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, self.character_num, self.character_embed
+
+
+class BatchReshape(Layer):
+    def __init__(self, cell_embed):
+        super(BatchReshape, self).__init__()
+        self.cell_embed = cell_embed
+
+    def call(self, inputs, mask=None, **kwargs):
+        input1 = inputs[0]
+        input2 = inputs[1]
+
+        batch = tf.shape(input1)[0]
+        height = tf.shape(input1)[1]
+        width = tf.shape(input1)[2]
+
+        # (batch, height, width, cell_embedding)
+        outputs = tf.reshape(input2, (batch, height, width, self.cell_embed))
+        print("batch_reshape", outputs)
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return None, None, None, self.cell_embed
+
+
+# def batch_reshape(x):
+#     return K.reshape(x, (batch, height, width, cell_embed))
+
+
+if __name__ == '__main__':
+    hidden_size = 64
+    attention_size = 64
+    character_num = 10
+    character_embed = 60
+    cell_embed = 8
+    # rows/cols are left dynamic so tables of different sizes can be fed
+    input_shape = (None, None, character_num, character_embed)
+
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    X_train = np.random.uniform(0, 1, (10, 16, 8, 10, 60))
+    X_test = np.random.uniform(0, 1, (10, 16, 8, 10, 60))
+    y_train = np.random.uniform(0, 1, (10, 16, 8))
+    y_test = np.random.uniform(0, 1, (10, 16, 8))
+
+    _input = Input(shape=input_shape, dtype="float32")
+
+    # fold the table grid into the batch axis:
+    # (batch*rows*cols, character_num, character_embed)
+    loop_cell = LoopCell(hidden_size, attention_size,
+                         character_num, character_embed,
+                         cell_embed)(_input)
+
+    # encode each cell's character sequence into a fixed-size cell embedding
+    # (batch*rows*cols, cell_embed)
+    cell_vec = LSTM(cell_embed)(loop_cell)
+    dense = Dense(cell_embed, activation="relu")(cell_vec)
+
+    # restore the grid: (batch, rows, cols, cell_embed)
+    grid = BatchReshape(cell_embed)([_input, dense])
+
+    # 2-D context over the whole table (row/col counts must be divisible by 8)
+    u_net = u_net_small(grid)
+    merge = Concatenate(axis=-1)([grid, u_net])
+    dense = Dense(cell_embed, activation='relu')(merge)
+    dense = Dense(1, activation='sigmoid')(dense)
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+    model = models.Model(inputs=_input, outputs=squeeze)
+    model.summary(line_length=120)
+    model.compile(loss='binary_crossentropy', optimizer='adam')
+    model.fit(X_train, y_train,
+              epochs=2,
+              batch_size=1,
+              validation_data=(X_test, y_test))
+
+    # a second run with larger tables to check that dynamic row/col sizes work
+    # (batch_size, row_num, col_num, character_num, character_embedding)
+    X_train = np.random.uniform(0, 1, (5, 32, 24, 10, 60))
+    X_test = np.random.uniform(0, 1, (5, 32, 24, 10, 60))
+    y_train = np.random.uniform(0, 1, (5, 32, 24))
+    y_test = np.random.uniform(0, 1, (5, 32, 24))
+
+    model.fit(X_train, y_train,
+              epochs=2,
+              batch_size=1,
+              validation_data=(X_test, y_test))
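
Not part of this commit: a small TF1-style sketch of the attention() helper defined at the top of this file. The placeholder sizes are made up; it only shows the helper pooling a (batch, time, dim) sequence into one (batch, dim) vector with learned weights.

import numpy as np
import tensorflow as tf
from BiddingKG.dl.table_head.models.loop_lstm import attention

hidden_size, attention_size = 32, 16
seq = tf.placeholder(tf.float32, (None, 10, hidden_size))
w_omega = tf.Variable(tf.random_uniform([hidden_size, attention_size], -0.25, 0.25))
b_omega = tf.Variable(tf.random_uniform([attention_size], -0.25, 0.25))
u_omega = tf.Variable(tf.random_uniform([attention_size], -0.25, 0.25))
pooled, alphas = attention(seq, w_omega, b_omega, u_omega)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(pooled, {seq: np.random.rand(4, 10, hidden_size)})
    print(out.shape)  # (4, 32): one attention-weighted vector per sequence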

+ 58 - 0
BiddingKG/dl/table_head/models/model_2.py

@@ -0,0 +1,58 @@
+import sys
+import os
+sys.path.append(os.path.abspath("../.."))
+from keras import layers, models
+import tensorflow as tf
+from BiddingKG.dl.table_head.models.my_average_pooling import MyAveragePooling1D
+from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
+
+
+def get_model(input_shape, output_shape):
+    # Input
+    input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_3 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_4 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_5 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_6 = layers.Input(shape=input_shape[1:], dtype="float32")
+
+    # Bi-LSTM
+    bi_lstm_1 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_1)
+    bi_lstm_2 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_2)
+    bi_lstm_3 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_3)
+    bi_lstm_4 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_4)
+    bi_lstm_5 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_5)
+    bi_lstm_6 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_6)
+
+    # Self-Attention
+    self_attention_1 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_1)
+    self_attention_2 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_2)
+    self_attention_3 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_3)
+    self_attention_4 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_4)
+    self_attention_5 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_5)
+    self_attention_6 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_6)
+
+    # Concat
+    concat_1 = layers.concatenate([self_attention_1, self_attention_2, self_attention_3])
+    concat_2 = layers.concatenate([self_attention_4, self_attention_5, self_attention_6])
+
+    # Dense + Sigmoid
+    dense_1 = layers.Dense(output_shape[0], activation="sigmoid")(concat_1)
+    dense_2 = layers.Dense(output_shape[0], activation="sigmoid")(concat_2)
+
+    # mask mean pooling
+    pool_1 = MyAveragePooling1D(axis=1)(dense_1)
+    pool_2 = MyAveragePooling1D(axis=1)(dense_2)
+
+    # Concat
+    concat = layers.concatenate([pool_1, pool_2])
+
+    # Dense
+    output = layers.Dense(10)(concat)
+    output = layers.Dense(1, activation="sigmoid", name='output')(output)
+
+    model = models.Model(inputs=[input_1, input_2, input_3, input_4, input_5, input_6],
+                         outputs=output)
+
+    model.summary()
+    return model
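
Not part of this commit: a minimal smoke test of get_model(). The shapes below are assumptions (MyAveragePooling1D is defined elsewhere in the repo); the point is only that the model takes six equally shaped sequence inputs and emits one sigmoid score per sample.

import numpy as np
from BiddingKG.dl.table_head.models.model_2 import get_model

input_shape = (None, 20, 60)   # (batch, time_steps, embedding) -- assumed sizes
output_shape = (20,)           # width of the per-branch Dense layer -- assumed
model = get_model(input_shape, output_shape)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

x = [np.random.rand(8, 20, 60).astype('float32') for _ in range(6)]
y = np.random.randint(0, 2, (8, 1)).astype('float32')
model.fit(x, y, epochs=1, batch_size=4)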

+ 49 - 0
BiddingKG/dl/table_head/models/tf_bi_lstm.py

@@ -0,0 +1,49 @@
+import tensorflow as tf
+from tensorflow.contrib.rnn import LSTMCell
+from tensorflow.contrib.rnn import MultiRNNCell
+
+
+class LstmBase:
+    """
+    build rnn cell
+    """
+    def build_rnn(self, hidden_size, num_layers):
+        cells = []
+        for i in range(num_layers):
+            cell = LSTMCell(num_units=hidden_size,
+                            state_is_tuple=True,
+                            initializer=tf.random_uniform_initializer(-0.25, 0.25))
+            cells.append(cell)
+        cells = MultiRNNCell(cells, state_is_tuple=True)
+
+        return cells
+
+
+class BiLstm(LstmBase):
+    """
+    define the lstm
+    """
+    def __init__(self, scope_name, hidden_size, num_layers):
+        super(BiLstm, self).__init__()
+        assert hidden_size % 2 == 0
+        hidden_size //= 2  # integer division so LSTMCell num_units stays an int
+
+        self.fw_rnns = []
+        self.bw_rnns = []
+        for i in range(num_layers):
+            self.fw_rnns.append(self.build_rnn(hidden_size, 1))
+            self.bw_rnns.append(self.build_rnn(hidden_size, 1))
+
+        self.scope_name = scope_name
+
+    def __call__(self, input, input_len):
+        for idx, (fw_rnn, bw_rnn) in enumerate(zip(self.fw_rnns, self.bw_rnns)):
+            scope_name = '{}_{}'.format(self.scope_name, idx)
+            ctx, _ = tf.nn.bidirectional_dynamic_rnn(
+                fw_rnn, bw_rnn, input, sequence_length=input_len,
+                dtype=tf.float32, time_major=False,
+                scope=scope_name
+            )
+            input = tf.concat(ctx, -1)
+        ctx = input
+        return ctx
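
Not part of this commit: a TF1 sketch of BiLstm on a padded batch of variable-length sequences (all sizes are made up). hidden_size is split into two directions of hidden_size // 2 each, so the concatenated context keeps the requested width.

import numpy as np
import tensorflow as tf
from BiddingKG.dl.table_head.models.tf_bi_lstm import BiLstm

seq = tf.placeholder(tf.float32, (None, 15, 60))    # (batch, time, embed)
lengths = tf.placeholder(tf.int32, (None,))
encoder = BiLstm('cell_encoder', hidden_size=64, num_layers=1)
ctx = encoder(seq, lengths)                          # (batch, time, 64): fw + bw halves

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(ctx, {seq: np.random.rand(4, 15, 60),
                         lengths: np.array([15, 10, 8, 15])})
    print(out.shape)  # (4, 15, 64)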

+ 82 - 0
BiddingKG/dl/table_head/models/u_net.py

@@ -0,0 +1,82 @@
+from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, BatchNormalization, UpSampling2D
+from keras.layers import LeakyReLU
+
+
+def u_net_small(inputs, num_classes=1):
+    # 8
+    use_bias = False
+    down0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(inputs)
+    down0 = BatchNormalization()(down0)
+    down0 = LeakyReLU(alpha=0.)(down0)
+    down0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(down0)
+    down0 = BatchNormalization()(down0)
+    down0 = LeakyReLU(alpha=0.)(down0)
+    down0_pool = MaxPooling2D((2, 2), strides=(2, 2))(down0)
+
+    # 4
+    down1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(down0_pool)
+    down1 = BatchNormalization()(down1)
+    down1 = LeakyReLU(alpha=0.)(down1)
+    down1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(down1)
+    down1 = BatchNormalization()(down1)
+    down1 = LeakyReLU(alpha=0.)(down1)
+    down1_pool = MaxPooling2D((2, 2), strides=(2, 2))(down1)
+
+    # 2
+    down2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(down1_pool)
+    down2 = BatchNormalization()(down2)
+    down2 = LeakyReLU(alpha=0.)(down2)
+    down2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(down2)
+    down2 = BatchNormalization()(down2)
+    down2 = LeakyReLU(alpha=0.)(down2)
+    down2_pool = MaxPooling2D((2, 2), strides=(2, 2))(down2)
+
+    center = Conv2D(64, (3, 3), padding='same', use_bias=use_bias)(down2_pool)
+    center = BatchNormalization()(center)
+    center = LeakyReLU(alpha=0.)(center)
+    center = Conv2D(64, (3, 3), padding='same', use_bias=use_bias)(center)
+    center = BatchNormalization()(center)
+    center = LeakyReLU(alpha=0.)(center)
+
+    # 2
+    up2 = UpSampling2D((2, 2))(center)
+    up2 = concatenate([down2, up2], axis=3)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+    up2 = Conv2D(32, (3, 3), padding='same', use_bias=use_bias)(up2)
+    up2 = BatchNormalization()(up2)
+    up2 = LeakyReLU(alpha=0.)(up2)
+
+    # 4
+    up1 = UpSampling2D((2, 2))(up2)
+    up1 = concatenate([down1, up1], axis=3)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+    up1 = Conv2D(16, (3, 3), padding='same', use_bias=use_bias)(up1)
+    up1 = BatchNormalization()(up1)
+    up1 = LeakyReLU(alpha=0.)(up1)
+
+    # 8
+    up0 = UpSampling2D((2, 2))(up1)
+    up0 = concatenate([down0, up0], axis=3)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+    up0 = Conv2D(8, (3, 3), padding='same', use_bias=use_bias)(up0)
+    up0 = BatchNormalization()(up0)
+    up0 = LeakyReLU(alpha=0.)(up0)
+
+    # classify
+    # classify = Conv2D(num_classes, (1, 1), activation='sigmoid')(up0)
+    return up0
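
Not part of this commit: a minimal sketch wrapping u_net_small() in a Keras model. Because of the three 2x2 poolings, the spatial (row, column) sizes should be divisible by 8; the sizes below are placeholders.

import numpy as np
from keras import layers, models
from BiddingKG.dl.table_head.models.u_net import u_net_small

inp = layers.Input(shape=(16, 24, 8))           # (rows, cols, cell_embed) -- assumed
features = u_net_small(inp)                     # same spatial size, 8 feature channels
out = layers.Conv2D(1, (1, 1), activation='sigmoid')(features)
model = models.Model(inp, out)

x = np.random.rand(2, 16, 24, 8).astype('float32')
print(model.predict(x).shape)                   # (2, 16, 24, 1)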

+ 17 - 0
BiddingKG/dl/table_head/preprocessing_test.py

@@ -0,0 +1,17 @@
+import codecs
+import pandas as pd
+from bs4 import BeautifulSoup
+from BiddingKG.dl.interface.extract import predict
+
+
+def test():
+    df = pd.read_excel("has_table_no_attach.xlsx")
+    for index, row in df.iterrows():
+        if index % 100 == 0:
+            print("Loop", index)
+        text = row['dochtmlcon']
+        predict(str(index), text)
+
+
+if __name__ == "__main__":
+    test()

+ 188 - 0
BiddingKG/dl/table_head/table_simplify.py

@@ -0,0 +1,188 @@
+#coding:utf-8
+import json
+import logging
+
+from BiddingKG.dl.table_head.pre_process import postgresql_util
+
+
+user_score = {
+    "test": 1.,
+    "test1": 0.83,
+    "test11": 0.82,
+    "test12": 0.74,
+    "test16": 0.83,
+    "test17": 0.77,
+    "test19": 0.79,
+    "test20": 0.82,
+    "test21": 0.73,
+    "test22": 0.64,
+    "test25": 0.77,
+    "test26": 0.80,
+    "test27": 0.72,
+    "test29": 0.8,
+    "test3": 0.,
+    "test7": 0.82,
+    "test8": 0.78,
+    "test9": 0.80,
+}
+
+
+def get_labeled_table():
+    sql = """
+    select id, update_user, table_text, pre_label, post_label
+    from label_table_head_info where status = 0
+    """
+
+    result_list = postgresql_util(sql, limit=1000000)
+    print("len(result_list)", len(result_list))
+    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+        not_eval_table_list = f.read()
+    not_eval_table_list = eval(not_eval_table_list)
+
+    table_list = []
+    # not_eval_table_list = []
+    for table in result_list:
+        pre_label = eval(table[3])
+        post_label = eval(table[4])
+        _id = table[0]
+        update_user = table[1]
+        table_text = table[2]
+        if _id in not_eval_table_list:
+            continue
+
+        try:
+            if table_text[0] == '"':
+                table_text = eval(table_text)
+            else:
+                table_text = table_text
+            table_text = table_text.replace('\\', '/')
+            table_text = eval(table_text)
+        except:
+            print("Failed to parse table_text", _id)
+            not_eval_table_list.append(_id)
+            continue
+
+        if post_label:
+            label_list = post_label
+        else:
+            label_list = pre_label
+
+        table_list.append([table_text, label_list, update_user, _id])
+    print("len(table_list)", len(table_list))
+    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "w") as f:
+    #     f.write(str(not_eval_table_list))
+    return table_list
+
+
+def table_distance(table1, table2, thresh=0.85):
+    # flatten
+    table1 = [col for row in table1 for col in row]
+    table2 = [col for row in table2 for col in row]
+    while "" in table1:
+        table1.remove("")
+    while "" in table2:
+        table2.remove("")
+
+    equal_cnt = 0
+    not_equal_cnt = 0
+    equal_flag = 0
+    for col1 in table1:
+        find_flag = 0
+        for col2 in table2:
+            if col1 == col2:
+                equal_cnt += 1
+                find_flag = 1
+                break
+        if not find_flag:
+            not_equal_cnt += 1
+        # print(equal_cnt, not_equal_cnt)
+        if round(equal_cnt / max(len(table1), len(table2)), 2) >= thresh:
+            # print("> thresh")
+            equal_flag = 1
+            break
+        if round(not_equal_cnt / max(len(table1), len(table2)), 2) >= 1-thresh:
+            # print("> 1-thresh")
+            equal_flag = 0
+            break
+    return equal_flag
+
+
+def remove_duplicate(table_list):
+    logging.info("into remove_duplicate")
+    table_list.sort(key=lambda x: x[0])
+    delete_table_id_list = []
+    for i in range(len(table_list)):
+        delete_table_id_list = list(set(delete_table_id_list))
+        if i % 1000 == 0:
+            print("Loop", i, "len(delete_table_id_list)", len(delete_table_id_list))
+            logging.info("*")
+            with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "w") as f:
+                f.write(str(delete_table_id_list))
+        table1 = table_list[i]
+        if len(table1[0]) <= 2 and len(table1[0][0]) <= 2:
+            delete_table_id_list.append(table1[3])
+            continue
+        for j in range(i+1, len(table_list)):
+            table2 = table_list[j]
+            if len(table2[0]) <= 2 and len(table2[0][0]) <= 2:
+                delete_table_id_list.append(table2[3])
+                continue
+            # skip if the row counts differ by 2 or more
+            if abs(len(table1[0]) - len(table2[0])) >= 2:
+                continue
+            # skip if the column counts differ by 2 or more
+            if abs(len(table1[0][0]) - len(table2[0][0])) >= 2:
+                continue
+            if table_distance(table1[0], table2[0]):
+                print("equal", table1[3], table2[3])
+                score1 = user_score.get(table1[2])
+                score2 = user_score.get(table2[2])
+                if score1 is None:
+                    score1 = 0.
+                if score2 is None:
+                    score2 = 0.
+                if score1 >= score2:
+                    delete_table_id_list.append(table2[3])
+                else:
+                    delete_table_id_list.append(table1[3])
+
+    delete_table_id_list = list(set(delete_table_id_list))
+    new_table_list = []
+    for table in table_list:
+        if table[3] not in delete_table_id_list:
+            new_table_list.append(table)
+    return new_table_list
+
+
+def eval_table(_str):
+    try:
+        if _str[0] == '"':
+            table_text = eval(_str)
+        else:
+            table_text = _str
+        table_text = table_text.replace('\\', '/')
+        table_text = eval(table_text)
+    except:
+        print("Failed to parse table_text")
+        table_text = ""
+    return table_text
+
+
+if __name__ == '__main__':
+    _list = get_labeled_table()
+    _list = remove_duplicate(_list)
+    _str = json.dumps(str(_list))
+    with open(r"C:\Users\Administrator\Desktop\table_simplify.txt", "w") as f:
+        f.write(_str)
+
+    # _str1 = "[['', '', 'Yes']]"
+    # _str2 = "[['', '', 'Yes', '']]"
+    # table1 = eval_table(_str1)
+    # table2 = eval_table(_str2)
+    #
+    # print(table_distance(table1, table2))
+
+    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
+    #     not_eval_table_list = f.read()
+    # print(not_eval_table_list)
+    # not_eval_table_list = eval(not_eval_table_list)
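
Not part of this commit: a tiny illustration of table_distance(). It flattens both tables, drops empty cells, and returns 1 as soon as the share of matching cells reaches thresh, or 0 once the share of mismatched cells exceeds 1 - thresh. The table contents are made-up examples.

t1 = [["No.", "Name", "Amount"], ["1", "Item A", "100"]]
t2 = [["No.", "Name", "Amount"], ["2", "Item B", "200"]]
print(table_distance(t1, t2, thresh=0.5))  # 1: the shared header row already reaches 0.5
print(table_distance(t1, t2, thresh=0.9))  # 0: too many differing cells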

BIN
BiddingKG/dl/table_head/vocab_word.pk