
Record the original entity name that was replaced via the dictionary

fangjiasheng 3 years ago
parent
commit
8cec1accb7

+ 2 - 0
BiddingKG/dl/entityLink/entityLink.py

@@ -75,8 +75,10 @@ def link_entitys(list_entitys,on_value=0.8):
                 if _ent.if_dict_match == 1:
                     if len(_ent.entity_text) > len(_entity.entity_text):
                         # print("字典替换", _entity.entity_text, "->", _ent.entity_text)
+                        _entity.origin_entity_text = _entity.entity_text
                         _entity.entity_text = _ent.entity_text
                         used_linked_entitys.append(_ent)
+            # print(_entity.origin_entity_text, _entity.entity_text)
 
 
 def getEnterprisePath():

+ 1 - 0
BiddingKG/dl/interface/Entitys.py

@@ -172,6 +172,7 @@ class Entity():
         self.if_dict_match = 0  # Added 2021/12/21: whether the company entity was matched from the dictionary
         self.is_total_money = 0  # Added 2021/12/29: whether the amount is a total price
         self.is_unit_money = 0  # Added 2021/12/29: whether the amount is a unit price
+        self.origin_entity_text = ''  # Added 2022/1/5: record the original entity name before dictionary replacement
 
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)

+ 3 - 3
BiddingKG/dl/table_head/loss.py

@@ -2,12 +2,12 @@ import tensorflow as tf
 import keras as K
 
 
-def focal_loss(gamma=2., alpha=.5):
-    def focal_loss_fixed(y_true, y_pred):
+def focal_loss(gamma=3., alpha=.25):
+    def f_loss(y_true, y_pred):
         pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
         pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
         return - K.backend.sum(alpha * K.backend.pow(1. - pt_1, gamma)
                                * K.backend.log(K.backend.epsilon()+pt_1))\
                - K.backend.sum((1-alpha) * K.backend.pow(pt_0, gamma)
                                * K.backend.log(1. - pt_0 + K.backend.epsilon()))
-    return focal_loss_fixed
+    return f_loss
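For context, focal_loss() is a factory: the closure it returns is the actual loss Keras consumes, so it can be passed anywhere a built-in loss would be. Below is a minimal sketch with the updated defaults (gamma=3., alpha=.25); the toy model is illustrative only and not part of this repo.

# Sketch only: compile a toy binary classifier with the updated focal loss.
import numpy as np
from keras import layers, models
from BiddingKG.dl.table_head.loss import focal_loss

toy = models.Sequential([layers.Dense(1, activation='sigmoid', input_shape=(4,))])
toy.compile(optimizer='adam', loss=focal_loss(gamma=3., alpha=.25))
toy.fit(np.random.rand(8, 4), np.random.randint(0, 2, (8, 1)), epochs=1, verbose=0)

Raising gamma puts more weight on hard, misclassified examples, while alpha rebalances the positive and negative terms; both are tunable hyperparameters rather than fixed choices.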

+ 85 - 0
BiddingKG/dl/table_head/metrics.py

@@ -0,0 +1,85 @@
+from keras import backend as K
+
+
+def mcor(y_true, y_pred):
+    # matthews_correlation
+    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
+    y_pred_neg = 1 - y_pred_pos
+
+    y_pos = K.round(K.clip(y_true, 0, 1))
+    y_neg = 1 - y_pos
+
+    tp = K.sum(y_pos * y_pred_pos)
+    tn = K.sum(y_neg * y_pred_neg)
+
+    fp = K.sum(y_neg * y_pred_pos)
+    fn = K.sum(y_pos * y_pred_neg)
+
+    numerator = (tp * tn - fp * fn)
+    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
+    return numerator / (denominator + K.epsilon())
+
+
+def precision(y_true, y_pred):
+    """Precision metric.
+
+    Only computes a batch-wise average of precision.
+
+    Computes the precision, a metric for multi-label classification of
+    how many selected items are relevant.
+    """
+    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    _precision = true_positives / (predicted_positives + K.epsilon())
+    return _precision
+
+
+def recall(y_true, y_pred):
+    """Recall metric.
+
+    Only computes a batch-wise average of recall.
+
+    Computes the recall, a metric for multi-label classification of
+    how many relevant items are selected.
+    """
+    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
+    _recall = true_positives / (possible_positives + K.epsilon())
+    return _recall
+
+
+def f1(y_true, y_pred):
+    def recall(y_true, y_pred):
+        """Recall metric.
+
+        Only computes a batch-wise average of recall.
+
+        Computes the recall, a metric for multi-label classification of
+        how many relevant items are selected.
+        """
+        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
+        _recall = true_positives / (possible_positives + K.epsilon())
+        return _recall
+
+    def precision(y_true, y_pred):
+        """Precision metric.
+
+        Only computes a batch-wise average of precision.
+
+        Computes the precision, a metric for multi-label classification of
+        how many selected items are relevant.
+        """
+        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
+        _precision = true_positives / (predicted_positives + K.epsilon())
+        return _precision
+    _precision = precision(y_true, y_pred)
+    _recall = recall(y_true, y_pred)
+    return 2*((_precision*_recall)/(_precision+_recall+K.epsilon()))
+
+
+# you can use it like this
+# model.compile(loss='binary_crossentropy',
+#               optimizer= "adam",
+#               metrics=[mcor,recall, f1])
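As a quick sanity check (a sketch, not part of this commit), these batch-wise metrics can be evaluated directly on constant tensors through the Keras backend:

# Sketch only: evaluate the metrics on hand-picked toy tensors.
from keras import backend as K
from BiddingKG.dl.table_head.metrics import precision, recall, f1

y_true = K.constant([1., 0., 1., 1.])
y_pred = K.constant([0.9, 0.8, 0.3, 0.7])   # rounds to [1, 1, 0, 1]
print(K.eval(precision(y_true, y_pred)))    # 2 of 3 predicted positives are correct -> ~0.67
print(K.eval(recall(y_true, y_pred)))       # 2 of 3 actual positives are recovered  -> ~0.67
print(K.eval(f1(y_true, y_pred)))           # harmonic mean of the two               -> ~0.67

Because each value is computed per batch, the numbers logged during training are averages over batches rather than exact epoch-level scores.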

+ 17 - 12
BiddingKG/dl/table_head/models/model.py

@@ -1,6 +1,8 @@
+import sys
+import os
+sys.path.append(os.path.abspath("../.."))
 from keras import layers, models
-from keras.layers import Lambda
-
+import tensorflow as tf
 from BiddingKG.dl.table_head.models.my_average_pooling import MyAveragePooling1D
 from BiddingKG.dl.table_head.models.self_attention import SeqSelfAttention
 
@@ -9,31 +11,34 @@ def get_model(input_shape, output_shape):
     # Input
     input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
     input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_3 = layers.Input(shape=input_shape[1:], dtype="float32")
 
     # Embedding
-    embed_1 = layers.Embedding(input_dim=6624, output_dim=32,
-                               input_length=input_shape[1], mask_zero=True)(input_1)
-    embed_2 = layers.Embedding(input_dim=6624, output_dim=32,
-                               input_length=input_shape[1], mask_zero=True)(input_2)
+    # embed_1 = layers.Embedding(input_dim=6624, output_dim=32,
+    #                            input_length=input_shape[1], mask_zero=True)(input_1)
+    # embed_2 = layers.Embedding(input_dim=6624, output_dim=32,
+    #                            input_length=input_shape[1], mask_zero=True)(input_2)
 
     # Bi-LSTM
-    bi_lstm_1 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(embed_1)
-    bi_lstm_2 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(embed_2)
+    bi_lstm_1 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_1)
+    bi_lstm_2 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_2)
+    bi_lstm_3 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_3)
 
     # Self-Attention
     self_attention_1 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_1)
     self_attention_2 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_2)
+    self_attention_3 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_3)
 
     # Concat
-    concat = layers.concatenate([self_attention_1, self_attention_2])
+    concat = layers.concatenate([self_attention_1, self_attention_2, self_attention_3])
 
     # Dense + Softmax
-    output = layers.Dense(output_shape[0], activation="softmax")(concat)
+    output = layers.Dense(output_shape[0], activation="sigmoid")(concat)
 
     # mask mean pooling
-    output = MyAveragePooling1D(axis=1)(output)
+    output = MyAveragePooling1D(axis=1, name='my_average_pooling_1d')(output)
 
-    model = models.Model(inputs=[input_1, input_2], outputs=output)
+    model = models.Model(inputs=[input_1, input_2, input_3], outputs=output)
 
     model.summary()
     return model
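A minimal sketch of building and exercising the new three-input model with the shapes this commit uses in train.py (input_shape = (3, 10, 60), output_shape = (3,)); the zero arrays stand in for the word embeddings produced in pre_process.py.

# Sketch only: construct the 3-input model and run a dummy forward pass.
import numpy as np
from BiddingKG.dl.table_head.models.model import get_model

model = get_model((3, 10, 60), (3,))
dummy = [np.zeros((2, 10, 60), dtype='float32') for _ in range(3)]   # 3 inputs, batch of 2
print(model.predict(dummy).shape)   # (2, 3): one sigmoid score per cell in the (prev, current, next) triple

With the Dense layer now ending in a sigmoid and the masked average pooling collapsing the character axis, each of the three positions gets an independent head/non-head probability instead of a softmax over two cells.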

+ 179 - 45
BiddingKG/dl/table_head/pre_process.py

@@ -1,9 +1,8 @@
-import sys
-import os
-sys.path.append(os.path.abspath("../.."))
 import psycopg2
 import numpy as np
 
+from BiddingKG.dl.common.Utils import embedding_word
+
 
 def get_sentence_index_list(sentence, dict_path='utils/ppocr_keys_v1.txt'):
     with open(dict_path, 'r') as f:
@@ -47,14 +46,14 @@ def get_data_from_sql(dim=10):
     select table_text, pre_label, post_label, id
     from label_table_head_info 
     where update_user <> 'test27' and table_box_cnt >= 4 and table_box_cnt <= 200 
-    limit 1000;
+    ;
     """
     # sql = """
     # select table_text, pre_label, post_label, id
     # from label_table_head_info
     # where id = 843
     # """
-    result_list = postgresql_util(sql, limit=10000)
+    result_list = postgresql_util(sql, limit=1000000)
 
     all_data_list = []
     all_data_label_list = []
@@ -90,22 +89,54 @@ def get_data_from_sql(dim=10):
         else:
             data_list, data_label_list = [], []
 
-        for data in data_list:
-            # Map Chinese characters to indices
-            data[0] = get_sentence_index_list(data[0])
-            data[1] = get_sentence_index_list(data[1])
-
-            # Pad with mask value 0 when shorter than dim
-            if len(data[0]) < dim:
-                data[0] = data[0] + [0]*(dim-len(data[0]))
-            elif len(data[0]) > dim:
-                data[0] = data[0][:dim]
-            if len(data[1]) < dim:
-                data[1] = data[1] + [0]*(dim-len(data[1]))
-            elif len(data[1]) > dim:
-                data[1] = data[1][:dim]
         all_data_list += data_list
         all_data_label_list += data_label_list
+
+    print("len(all_data_list)", len(all_data_list))
+
+        #
+        # new_data_list = []
+        # for data in data_list:
+        #     # Map Chinese characters to indices
+        #     # data[0] = get_sentence_index_list(data[0])
+        #     # data[1] = get_sentence_index_list(data[1])
+        #     # Pad with mask value 0 when shorter than dim
+        #     # if len(data[0]) < dim:
+        #     #     data[0] = data[0] + [0]*(dim-len(data[0]))
+        #     # elif len(data[0]) > dim:
+        #     #     data[0] = data[0][:dim]
+        #     # if len(data[1]) < dim:
+        #     #     data[1] = data[1] + [0]*(dim-len(data[1]))
+        #     # elif len(data[1]) > dim:
+        #     #     data[1] = data[1][:dim]
+        #
+        #     # Map Chinese characters to embeddings
+        #     data = embedding_word(data, input_shape)
+        #     new_data_list.append(data)
+        #
+        # new_data_list = np.array(new_data_list)
+        # data_label_list = np.array(data_label_list)
+        # if np.array(new_data_list).shape[1:] == input_shape:
+        #     all_data_list.append(new_data_list)
+        #     all_data_label_list.append(data_label_list)
+
+    # # Split to keep the final concat from being too slow
+    # split_len = 1000
+    # _len = int(len(all_data_list) / split_len)
+    # all_data_list_1 = []
+    # all_data_list_2 = []
+    # for i in range(_len):
+    #     if i == _len - 1:
+    #         array1 = np.concatenate(all_data_list[i*split_len:])
+    #         array2 = np.concatenate(all_data_label_list[i*split_len:])
+    #     else:
+    #         array1 = np.concatenate(all_data_list[i*split_len:i*split_len+split_len])
+    #         array2 = np.concatenate(all_data_label_list[i*split_len:i*split_len+split_len])
+    #     all_data_list_1.append(array1)
+    #     all_data_list_2.append(array2)
+    # all_data_list = np.concatenate(all_data_list_1)
+    # all_data_label_list = np.concatenate(all_data_list_2)
+
     return all_data_list, all_data_label_list
 
 
@@ -121,6 +152,13 @@ def table_process(text_list, label_list, _id):
         row = text_list[i]
         row_label = label_list[i]
 
+        if i > 0:
+            last_row = text_list[i-1]
+            last_row_label = label_list[i-1]
+        else:
+            last_row = []
+            last_row_label = []
+
         if i < len(text_list) - 1:
             next_row = text_list[i+1]
             next_row_label = label_list[i+1]
@@ -132,47 +170,86 @@ def table_process(text_list, label_list, _id):
             col = row[j]
             col_label = row_label[j]
 
+            # Cells outside the table are set to None with label 0
+            if j > 0:
+                last_col = row[j-1]
+                last_col_label = row_label[j-1]
+            else:
+                last_col = None
+                last_col_label = 0
+
             if j < len(row) - 1:
                 next_col = row[j+1]
                 next_col_label = row_label[j+1]
             else:
-                next_col = ""
-                next_col_label = ""
+                next_col = None
+                next_col_label = 0
+
+            if last_row:
+                last_row_col = last_row[j]
+                last_row_col_label = last_row_label[j]
+            else:
+                last_row_col = None
+                last_row_col_label = 0
 
             if next_row:
                 next_row_col = next_row[j]
                 next_row_col_label = next_row_label[j]
             else:
-                next_row_col = ""
-                next_row_col_label = ""
-
-            if next_col:
-                if col != next_col:
-                    data_list.append([col, next_col])
-                    data_label_list.append([int(col_label), int(next_col_label)])
-            if next_row_col:
-                if col != next_row_col:
-                    data_list.append([col, next_row_col])
-                    data_label_list.append([int(col_label), int(next_row_col_label)])
+                next_row_col = None
+                next_row_col_label = 0
+
+            # Keep the triple as a sample if any pair of cells differs
+            # if col != next_col or col != last_col:
+            data_list.append([last_col, col, next_col])
+            data_label_list.append([int(last_col_label), int(col_label),
+                                    int(next_col_label)])
+
+            # if col != next_row_col or col != last_row_col:
+            data_list.append([last_row_col, col, next_row_col])
+            data_label_list.append([int(last_row_col_label), int(col_label),
+                                    int(next_row_col_label)])
 
     return data_list, data_label_list
 
 
-def get_data_from_file():
-    data_path = 'train_data/data.txt'
-    data_label_path = 'train_data/data_label.txt'
+def get_data_from_file(file_type):
+    if file_type == 'np':
+        data_path = 'train_data/data_3.npy'
+        data_label_path = 'train_data/data_label_3.npy'
 
-    with open(data_path, 'r') as f:
-        data_list = f.readlines()
-    with open(data_label_path, 'r') as f:
-        data_label_list = f.readlines()
+        array1 = np.load(data_path)
+        array2 = np.load(data_label_path)
+        return array1, array2
+    elif file_type == 'txt':
+        data_path = 'train_data/data.txt'
+        data_label_path = 'train_data/data_label.txt'
 
-    for i in range(len(data_list)):
-        data_list[i] = eval(data_list[i][:-1])
-        data_label_list[i] = eval(data_label_list[i][:-1])
+        with open(data_path, 'r') as f:
+            data_list = f.readlines()
+        with open(data_label_path, 'r') as f:
+            data_label_list = f.readlines()
 
-    print(len(data_list))
-    return data_list, data_label_list
+        # for i in range(len(data_list)):
+        #     data_list[i] = eval(data_list[i][:-1])
+        #     data_label_list[i] = eval(data_label_list[i][:-1])
+        return data_list, data_label_list
+    else:
+        print("file type error! only np and txt supported")
+        raise Exception
+
+
+def processed_save_to_np():
+    array1, array2 = get_data_from_sql()
+    np.save('train_data/data_3.npy', array1)
+    np.save('train_data/data_label_3.npy', array2)
+
+    # with open('train_data/data.txt', 'w') as f:
+    #     for line in list1:
+    #         f.write(str(line) + "\n")
+    # with open('train_data/data_label.txt', 'w') as f:
+    #     for line in list2:
+    #         f.write(str(line) + "\n")
 
 
 def processed_save_to_txt():
@@ -185,5 +262,62 @@ def processed_save_to_txt():
             f.write(str(line) + "\n")
 
 
+def data_balance():
+    array1, array2 = get_data_from_file()
+    data_list = array2.tolist()
+    all_cnt = len(data_list)
+    cnt_0 = 0
+    cnt_1 = 0
+    for data in data_list:
+        if data[0] == 1 or data[1] == 1:
+            cnt_1 += 1
+        else:
+            cnt_0 += 1
+    print("all_cnt", all_cnt)
+    print("label has 1", cnt_1)
+    print("label all 0", cnt_0)
+
+
+def test_embedding():
+    output_shape = (2, 1, 60)
+    data = [[None], [None]]
+    result = embedding_word(data, output_shape)
+    print(result)
+
+
+def my_data_loader(data_list, data_label_list, batch_size):
+    data_num = len(data_list)
+
+    # Embedding output shape
+    output_shape = (3, 10, 60)
+
+    # Fetch data batch by batch in a loop
+    i = 0
+    while True:
+        new_data_list = []
+        for j in range(batch_size):
+            if i >= data_num:
+                i = 0
+
+            # Map Chinese characters to embeddings
+            data = eval(data_list[i][:-1])
+            data = embedding_word(data, output_shape)
+            if data.shape == output_shape:
+                new_data_list.append(data)
+            i += 1
+
+        new_data_list = np.array(new_data_list)
+        data_label_list = np.array(data_label_list)
+        X = new_data_list
+        Y = data_label_list
+        # (table_num, 3 sentences, dim characters, embedding) -> (3, table_num, dim, embedding)
+        X = np.transpose(X, (1, 0, 2, 3))
+
+        yield [X[0], X[1], X[2]], Y
+
+
 if __name__ == '__main__':
-    get_data_from_file()
+    processed_save_to_txt()
+    # data_balance()
+
+    # test_embedding()
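A rough sketch of how these pieces chain together (assuming processed_save_to_txt() has already written train_data/data.txt and train_data/data_label.txt, and that the working directory is BiddingKG/dl/table_head so the relative paths resolve):

# Sketch only: read the saved triples back and draw one batch from the generator.
from BiddingKG.dl.table_head.pre_process import get_data_from_file, my_data_loader

data_x, data_y = get_data_from_file('txt')             # raw text lines; labels stay as written
loader = my_data_loader(data_x, data_y, batch_size=4)
[x1, x2, x3], y = next(loader)                         # three (batch, 10, 60) embedding arrays
print(x1.shape, x2.shape, x3.shape)

The same generator is what train.py below hands to fit_generator, once for the training split and once for the validation split.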

+ 54 - 26
BiddingKG/dl/table_head/train.py

@@ -1,38 +1,44 @@
 import sys
 import os
-sys.path.append(os.path.abspath("../.."))
-from keras import optimizers
-from tensorflow.contrib.metrics import f1_score
-from tensorflow.python.ops.metrics_impl import precision, recall
+sys.path.append(os.path.abspath("../../.."))
+os.environ['KERAS_BACKEND'] = 'tensorflow'
+import keras
+from BiddingKG.dl.table_head.metrics import precision, recall, f1
+from keras import optimizers, Model
 from BiddingKG.dl.table_head.models.model import get_model
 from BiddingKG.dl.table_head.loss import focal_loss
-from keras.callbacks import ModelCheckpoint
-from BiddingKG.dl.table_head.pre_process import get_data_from_file
+from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
+from BiddingKG.dl.table_head.pre_process import get_data_from_file, get_data_from_sql, my_data_loader
 import numpy as np
+from keras import backend as K
 
 
-input_shape = (2, 10)
-output_shape = (2,)
-pretrained_path = ""
+input_shape = (3, 10, 60)
+output_shape = (3,)
+batch_size = 1024
+epochs = 1000
+pretrained_path = "checkpoints/best.hdf5"
 checkpoint_path = "checkpoints/"
 PRETRAINED = False
 CHECKPOINT = False
 
 
 def train():
+    # GPU available
+    print("gpus", K.tensorflow_backend._get_available_gpus())
+
     # Data
-    data_x, data_y = get_data_from_file()
-    data_x = np.array(data_x)
-    data_y = np.array(data_y)
+    data_x, data_y = get_data_from_file('txt')
+    print("finish read data", len(data_x))
 
     # Split -> Train, Test
     split_size = int(len(data_x)*0.1)
     test_x, test_y = data_x[:split_size], data_y[:split_size]
     train_x, train_y = data_x[split_size:], data_y[split_size:]
 
-    # (table_num, 2 sentences, dim characters) -> (2, table_num, dim)
-    train_x = np.transpose(train_x, (1, 0, 2))
-    test_x = np.transpose(test_x, (1, 0, 2))
+    # Data Loader
+    train_data_loader = my_data_loader(train_x, train_y, batch_size=batch_size)
+    test_data_loader = my_data_loader(test_x, test_y, batch_size=batch_size)
 
     # Model
     model = get_model(input_shape, output_shape)
@@ -47,18 +53,40 @@ def train():
     else:
         print("no checkpoint")
 
-    filepath = '{epoch:02d}-{val_loss:.2f}.h5'
-    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5", monitor=focal_loss(),
-                                 verbose=1, save_best_only=True, mode='min')
-    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=focal_loss(),
-                  metrics=[focal_loss()])
+    filepath = 'e-{epoch:02d}-loss-{val_loss:.2f}'
+    checkpoint = ModelCheckpoint(checkpoint_path+filepath+".hdf5", monitor='val_f1',
+                                 verbose=1, save_best_only=True, mode='max')
+
+    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss='binary_crossentropy',
+                  metrics=['binary_crossentropy', 'acc',
+                           precision, recall, f1])
+
+    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=5,
+                            verbose=1, mode='max', cooldown=0, min_lr=0)
+
+    model.fit_generator(train_data_loader,
+                        steps_per_epoch=max(1, len(train_x) // batch_size),
+                        callbacks=[checkpoint, rlu],
+                        validation_data=test_data_loader,
+                        validation_steps=max(1, len(test_x) // batch_size),
+                        epochs=epochs)
+
+    # model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y,
+    #           validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
+    #           epochs=epochs, batch_size=256, shuffle=True,
+    #           callbacks=[checkpoint, rlu])
+
+    return model, test_x
+
+
+def print_layer_output(model, data):
+    middle_layer = Model(inputs=model.inputs,
+                         outputs=model.get_layer('input_2').output)
 
-    print(train_x.shape, train_y.shape)
-    model.fit(x=[train_x[0], train_x[1]], y=train_y,
-              validation_data=([test_x[0], test_x[1]], test_y),
-              epochs=100, batch_size=128, shuffle=True,
-              callbacks=[checkpoint])
+    middle_layer_output = middle_layer.predict([data[0], data[1]])
+    print(middle_layer_output)
+    return
 
 
 if __name__ == '__main__':
-    train()
+    model, data = train()