|
@@ -0,0 +1,619 @@
|
|
|
+#!/usr/bin/python3
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# @Author : bidikeji
|
|
|
+# @Time : 2021/7/28 0028 11:32
|
|
|
+
|
|
|
+import os
|
|
|
+# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
|
|
+import sys
|
|
|
+sys.path.append(os.path.abspath("../../.."))
|
|
|
+import pandas as pd
|
|
|
+
|
|
|
+from BiddingKG.dl.interface.modelFactory import Model_role_classify_word
|
|
|
+from BiddingKG.dl.common.Utils import *
|
|
|
+import tensorflow as tf
|
|
|
+import tensorflow.keras.backend as K
|
|
|
+# from tensorflow.keras import layers, models,optimizers,losses,callbacks
|
|
|
+
|
|
|
+from keras import layers, models,optimizers,losses,callbacks
|
|
|
+# import keras.backend as K
|
|
|
+# from keras.models import Model
|
|
|
+from keras.engine.topology import Layer
|
|
|
+
|
|
|
+from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
|
|
|
+
|
|
|
def recall(y_true, y_pred):
    """Keras metric: recall of the rounded predictions.

    Args:
        y_true: Tensor holding the ground-truth labels.
        y_pred: Tensor holding the labels predicted by the model.

    Returns:
        Scalar tensor ``true_positives / (actual_positives + epsilon)``.
        It evaluates to 0 when ``y_true`` contains no positive entry.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # The operands are symbolic tensors (this file runs the model inside a
    # tf.Session), so a Python-level `if actual_positives == 0` is decided
    # while the graph is built and cannot protect against a zero denominator
    # at run time.  Adding K.epsilon() keeps the division finite instead.
    return true_positives / (actual_positives + K.epsilon())
|
|
|
+
|
|
|
+
|
|
|
def f1_score(y_true, y_pred):
    """Keras metric: F1 score of the rounded predictions.

    Args:
        y_true: Tensor holding the ground-truth labels.
        y_pred: Tensor holding the labels predicted by the model.

    Returns:
        Scalar tensor with the harmonic mean of precision and recall.  Every
        denominator is padded with ``K.epsilon()``, so the value is 0 (not
        NaN) when there are no predicted or no actual positives.
    """
    eps = K.epsilon()
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    # A Python `if tensor == 0` cannot inspect run-time values of symbolic
    # tensors, so zero denominators are handled numerically instead.
    # Local names deliberately differ from the sibling metric functions
    # `precision` and `recall` to avoid shadowing them.
    precision_value = true_positives / (predicted_positives + eps)
    recall_value = true_positives / (actual_positives + eps)
    return 2 * (precision_value * recall_value) / (precision_value + recall_value + eps)
|
|
|
+
|
|
|
+
|
|
|
def precision(y_true, y_pred):
    """Keras metric: precision of the rounded predictions.

    Args:
        y_true: Tensor holding the ground-truth labels.
        y_pred: Tensor holding the labels predicted by the model.

    Returns:
        Scalar tensor ``true_positives / (predicted_positives + epsilon)``.
        It evaluates to 0 when nothing is predicted positive.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # K.epsilon() keeps the ratio finite when no prediction rounds to 1;
    # the plain division produced NaN in that case.
    return true_positives / (predicted_positives + K.epsilon())
|
|
|
+
|
|
|
# Number of context characters kept on each side of an entity; it is the
# default `seq_len` of word2id() and the slice width used when building the
# `front20` / `behind20` columns.  (An earlier value was 20.)
seq_len = 30
# NOTE(review): `sp` is not referenced in the visible part of this file.
sp = 30
# Role name -> class id.  In order: tenderee, agency, winning bidder,
# second candidate, third candidate, other role.
lb2id = {
    role: index
    for index, role in enumerate(
        ('招标人', '代理人', '中标人', '第二候选人', '第三候选人', '其他角色'))
}
|
|
|
+
|
|
|
+
|
|
|
def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
    """Build and compile the multi-input BiLSTM classifier.

    Each input sequence goes through one shared embedding layer and then its
    own bidirectional LSTM; the LSTM outputs are concatenated and fed to a
    softmax layer.

    Args:
        input_shape: Indexable ``(n_inputs, sequence_length, embedding_dim)``.
        vocab: Vocabulary; only ``len(vocab)`` is used (embedding input size).
        embedding_weights: Initial embedding matrix, or ``None`` to let Keras
            initialise the embedding.
        classes: Number of output classes.
        use_am: Accepted for backward compatibility; not used by this function.

    Returns:
        The compiled Keras model.  Its inputs are named ``input0``,
        ``input1``, ... in order.
    """
    list_input = [
        layers.Input(shape=(input_shape[1],), dtype=tf.int32, name="input%d" % i)
        for i in range(input_shape[0])
    ]

    # One embedding layer shared by every input.  The (misspelt) layer name
    # is kept as is because checkpoints are loaded with by_name=True.
    embedding = layers.Embedding(len(vocab), input_shape[2],
                                 weights=[embedding_weights] if embedding_weights is not None else None,
                                 mask_zero=True, trainable=True, name="char_embeding")
    list_embedding = [embedding(single_input) for single_input in list_input]

    # One BiLSTM per input.  The previous code hard-coded exactly two
    # branches; building them in input order keeps the same layer creation
    # order for the two-input case and also supports other input counts.
    # (An earlier experiment used GRU(60) with the same dropout settings.)
    list_lstm = [
        layers.Bidirectional(
            layers.LSTM(120, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(embedded)
        for embedded in list_embedding
    ]

    # Concatenation needs at least two tensors; a single branch is used as is.
    if len(list_lstm) == 1:
        concat = list_lstm[0]
    else:
        concat = layers.concatenate(list_lstm, axis=1)

    out = layers.Dense(classes,activation="softmax")(concat)
    model = models.Model(list_input,out)
    model.compile(optimizer=optimizers.Adam(lr=0.001),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
    model.summary()

    return model
|
|
|
+
|
|
|
def labeling(label, out_len=6):
    """Return a one-hot vector of length *out_len* with position *label* set to 1.

    Args:
        label: Index of the entry to set.
        out_len: Length of the returned vector.

    Returns:
        A 1-D NumPy array of zeros with a single 1 at ``label``.
    """
    one_hot = np.zeros(out_len)
    one_hot[label] = 1
    return one_hot
|
|
|
+
|
|
|
def word2id(df, seq_len=seq_len, is_test=False):
    """Encode the left/right context of every row of *df* for the model.

    Reads the columns ``front20``, ``entity_text``, ``behind20`` and
    ``new_label``.  Non-string contexts are treated as empty.  The left
    context keeps only what follows its first '。'; the right context is cut
    right after its first '。', so text beyond those sentence marks is hidden.

    Args:
        df: DataFrame providing the four columns above.
        seq_len: Passed to ``encodeInput`` as ``word_len``.
        is_test: When true the raw label is returned; otherwise it is one-hot
            encoded with ``labeling``.

    Returns:
        ``(x, y)``: the stacked encodings with their first two axes swapped,
        and the array of labels.
    """
    def _text_or_empty(value):
        return value if isinstance(value, str) else ""

    encoded_rows = []
    targets = []
    rows = zip(df["front20"], df["entity_text"], df["behind20"], df["new_label"])
    # The entity text itself is read but not fed to the model.
    for front, _entity_text, behind, label in rows:
        front = _text_or_empty(front)
        behind = _text_or_empty(behind)

        # Left context: drop everything up to and including the first '。'.
        _, mark, remainder = front.partition('。')
        if mark:
            front = remainder
        # Right context: keep everything up to and including the first '。'.
        leading, mark, _ = behind.partition('。')
        behind = leading + mark

        encoded_rows.append(
            encodeInput([front, behind], word_len=seq_len, word_flag=True, userFool=False))
        targets.append(label if is_test else labeling(label))
    return np.transpose(np.array(encoded_rows), (1, 0, 2)), np.array(targets)
|
|
|
+
|
|
|
def fix_digit_eng(text):
    """Normalise digits, ordinals and alphanumeric codes in a context string.

    The substitutions run in the order written; later patterns see the output
    of earlier ones.  All patterns are raw strings so that ``\\d``, ``\\[`` and
    ``\\.`` are regex escapes rather than (invalid) Python string escapes.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # "第一条/第二中学/第三方..." style phrases become the placeholder 'xxx'.
    text = re.sub(r'第[一二三1-3]([条项章]|中学|医院|附属)|第三方(服务机构)?', 'xxx', text)
    # "第01(中标|成交)候选人" is rewritten to "第一中标候选人".
    text = re.sub(r'第01(中标|成交)?候选人', '第一中标候选人', text)
    # Lot numbers: "标段<n>" -> "标段d", "第<n>标段/包" -> "d标段".
    text = re.sub(r'标段[一二三1-3]', '标段d', text)
    text = re.sub(r'第[一二三1-3](标段?|[分子标]?包)', 'd标段', text)
    # A letter followed by 3+ letters/digits/=&_—- collapses to 'abc'.
    text = re.sub(r'[a-zA-Z][a-zA-Z0-9=&_—-]{3,}', 'abc', text)
    # Bracketed numbers and digit groups joined by : . - collapse to 'd'.
    text = re.sub(r'[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd', text)
    # Runs of Chinese numerals (2+ of any, or 1+ of 四..十) collapse to 'd'.
    text = re.sub(r'[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)
    # Multi-digit numbers, decimals and the single digits 0 and 4-9 become
    # 'd'; the single digits 1-3 are left untouched.
    text = re.sub(r'\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)
    # Serial numbers, "第N次" counters and "N、" list markers are removed.
    text = re.sub(r'序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', '', text)
    # NOTE(review): as written both replace() calls map a character to itself
    # and therefore change nothing; possibly a full-width/half-width bracket
    # conversion was intended - confirm against the original file.
    return text.replace('(', '(').replace(')', ')')
|
|
|
+
|
|
|
def train():
    """Train the role classifier and checkpoint the best weights under ``log/``.

    Steps:
      1. Load the train/test spreadsheets plus the supplementary sheet; the
         supplement is added four times to the training set and once to the
         test set.
      2. Shuffle, fill missing cells with "", and build the normalised
         ``front20`` / ``behind20`` context columns.
      3. Where a ``relabel`` column exists, let its non-empty values override
         ``new_label``; keep only rows whose label is one of 0..5.
      4. Encode with ``word2id`` and fit the model for 100 epochs, saving the
         weights whenever ``val_loss`` improves.

    Raises:
        ValueError: If the training data does not contain every class id
            listed in ``lb2id``.
    """
    valid_labels = [0, 1, 2, 3, 4, 5]

    df_train = pd.read_excel('traindata/所有训练测试数据_train.xlsx')
    df_test = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
    # Raw string: same path value, without an invalid "\角" string escape.
    df = pd.read_excel(r'E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_train = pd.concat([df_train, df, df, df, df], ignore_index=True, sort=False)
    df_test = pd.concat([df_test, df], ignore_index=True, sort=False)

    df_train = df_train.sample(frac=1)
    df_test = df_test.sample(frac=1)

    # Fill missing cells BEFORE the contexts are sliced.  Previously str(x)
    # ran first, which turned a missing context into the literal text "nan"
    # for training, while test() fills first and therefore encodes "".
    df_train.fillna("", inplace=True)
    df_test.fillna("", inplace=True)

    for frame in (df_train, df_test):
        frame['front20'] = frame['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
        frame['behind20'] = frame['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
        if 'relabel' in frame.columns:
            # A non-empty manual relabel wins over the stored label.
            frame['new_label'] = frame.apply(lambda x: int(x['relabel']) if x['relabel'] !="" else int(x['new_label']), axis=1)

    for frame_name, frame in (('df_train', df_train), ('df_test', df_test)):
        # `relabel` is optional (see the guard above); indexing it
        # unconditionally raised KeyError when the column was absent.
        relabels = set(frame['relabel']) if 'relabel' in frame.columns else set()
        print(frame_name, set(frame['new_label']), relabels)

    df_train = df_train[df_train['new_label'].isin(valid_labels)]
    df_test = df_test[df_test['new_label'].isin(valid_labels)]
    print('训练数据:%d,测试数据:%d'%(len(df_train), len(df_test)))

    print(set(df_train['new_label']), set(lb2id.values()))
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if set(df_train['new_label']) != set(lb2id.values()):
        raise ValueError('training data does not cover every class id in lb2id')

    train_x, train_y = word2id(df_train)
    print('train_x.shape', train_x.shape)
    print('train_y.shape', train_y.shape)
    print('train_x: ', train_x[0])
    test_x, test_y = word2id(df_test)
    with tf.Session():
        vocab, matrix = getVocabAndMatrix(getModel_word())
        model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
        # Training starts from the embedding weights only.  To warm-start
        # from a checkpoint, call model.load_weights(path, by_name=True) here.

        callback = callbacks.ModelCheckpoint(
            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
        model.fit(x=[train_x[0],train_x[1]], y=train_y, batch_size=512, epochs=100, callbacks=[callback],
                  validation_data=([test_x[0],test_x[1]], test_y))
|
|
|
def _print_class_metrics(df, truth_col, pred_col, hit_col, classes):
    """Print precision (labelled "acc"), recall and F1 for each class in *classes*.

    ``hit_col`` must hold 1 where the prediction equals the truth, else 0.
    """
    for cls in classes:
        truth_rows = df[df[truth_col] == cls]
        n_truth = len(truth_rows)
        n_hit = sum(truth_rows[hit_col])
        n_pred = len(df[df[pred_col] == cls])
        prec = n_hit / n_pred if n_pred > 0 else 0
        rec = n_hit / n_truth if n_truth > 0 else 0
        f1 = 2 * rec * prec / (prec + rec) if (prec + rec) > 0 else 0
        print('类别:%d, acc:%.4f, recall:%.4f, f1: %.4f'%(cls, prec, rec, f1))


def test():
    """Evaluate the selected checkpoint on the test spreadsheet.

    Prints per-class precision/recall/F1 of the new model (``pred_new`` vs
    ``new_label``) and, under the heading '旧模型:', the same figures for the
    stored ``label`` column compared against ``new_label``.
    """
    df_val = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
    df_val.fillna('', inplace=True)

    if 'new_label' in df_val.columns:
        if 'relabel' in df_val.columns:
            # A relabel that is a valid class id overrides the stored label.
            df_val['new_label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] in [0,1,2,3,4,5] else x['new_label'], axis=1)
    else:
        # No corrected label column at all: fall back to the original label.
        df_val['new_label'] = df_val['label']
    print('测试公告数量:', len(df_val), set(df_val['new_label']))
    df_val['new_label'] = df_val['new_label'].apply(lambda x: int(x))

    df_val['front20'] = df_val['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
    df_val['behind20'] = df_val['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
    # Last six characters of the left context, reversed.
    df_val['front_reverse'] = df_val['front20'].apply(lambda x: x[-6:][::-1])

    df_val.reset_index(drop=True, inplace=True)
    val_x, _ = word2id(df_val, seq_len=seq_len, is_test=True)

    # NOTE(review): the old model object is only referenced by code that has
    # been commented out; the call is kept in case construction matters.
    role_old = Model_role_classify_word()

    with tf.Session():
        vocab, matrix = getVocabAndMatrix(getModel_word())
        model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
        print("loading weights")
        # 2023-10-08: separate front/behind LSTM inputs, entity text removed;
        # this is the checkpoint that was finally chosen.
        model.load_weights("log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5",by_name=True)
        logit = model.predict([val_x[0],val_x[1]])

    print('新模型预测结果', logit[:3])

    df_val['pred_new'] = pd.DataFrame(np.argmax(logit, axis=-1))
    df_val['prob_new'] = pd.DataFrame(np.amax(logit, axis=1))
    df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['new_label'] else 0, axis=1)
    _print_class_metrics(df_val, truth_col='new_label', pred_col='pred_new',
                         hit_col='new=lb', classes=set(df_val['new_label']))

    print('旧模型:')
    df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['new_label'] else 0, axis=1)
    _print_class_metrics(df_val, truth_col='new_label', pred_col='label',
                         hit_col='old=lb', classes=set(df_val['label']))

    # To keep the predictions, write them out here, e.g.
    # df_val.to_excel('traindata/所有训练测试数据_test_predict.xlsx', index=False)
    print('df_val.columns', df_val.columns)
|
|
|
+
|
|
|
+
|
|
|
+'''
|
|
|
+类别:0, acc:0.4199, recall:0.6492, f1: 0.5099
|
|
|
+类别:1, acc:0.5126, recall:0.7846, f1: 0.6201
|
|
|
+类别:2, acc:0.4416, recall:0.6632, f1: 0.5301
|
|
|
+类别:3, acc:0.7455, recall:0.7961, f1: 0.7700
|
|
|
+类别:4, acc:0.7471, recall:0.8553, f1: 0.7975
|
|
|
+类别:5, acc:0.9664, recall:0.9100, f1: 0.9373
|
|
|
+
|
|
|
+类别:0, acc:0.9537, recall:0.9777, f1: 0.9655
|
|
|
+类别:1, acc:0.9589, recall:0.9722, f1: 0.9655
|
|
|
+类别:2, acc:0.9227, recall:0.9502, f1: 0.9363
|
|
|
+类别:3, acc:0.8750, recall:0.9333, f1: 0.9032
|
|
|
+类别:4, acc:0.9643, recall:1.0000, f1: 0.9818
|
|
|
+类别:5, acc:0.9476, recall:0.8690, f1: 0.9066
|
|
|
+
|
|
|
+类别:0, acc:0.9393, recall:0.9319, f1: 0.9356
|
|
|
+类别:1, acc:0.9500, recall:0.9620, f1: 0.9560
|
|
|
+类别:2, acc:0.9156, recall:0.9406, f1: 0.9279
|
|
|
+类别:3, acc:0.8857, recall:0.9394, f1: 0.9118
|
|
|
+类别:4, acc:0.9655, recall:0.9333, f1: 0.9492
|
|
|
+类别:5, acc:0.9102, recall:0.8990, f1: 0.9046
|
|
|
+
|
|
|
+类别:0, acc:0.9357, recall:0.9615, f1: 0.9484
|
|
|
+类别:1, acc:0.9538, recall:0.9483, f1: 0.9510
|
|
|
+类别:2, acc:0.9271, recall:0.9366, f1: 0.9318
|
|
|
+类别:3, acc:0.9600, recall:0.9863, f1: 0.9730
|
|
|
+类别:4, acc:0.9429, recall:0.9851, f1: 0.9635
|
|
|
+类别:5, acc:0.9407, recall:0.9098, f1: 0.9250
|
|
|
+
|
|
|
+类别:0, acc:0.9402, recall:0.9556, f1: 0.9478
|
|
|
+类别:1, acc:0.9593, recall:0.9375, f1: 0.9483
|
|
|
+类别:2, acc:0.9243, recall:0.9412, f1: 0.9327
|
|
|
+类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
|
|
|
+类别:4, acc:0.9452, recall:0.9857, f1: 0.9650
|
|
|
+类别:5, acc:0.9296, recall:0.9058, f1: 0.9176
|
|
|
+
|
|
|
+类别:0, acc:0.9468, recall:0.9568, f1: 0.9518
|
|
|
+类别:1, acc:0.9489, recall:0.9489, f1: 0.9489
|
|
|
+类别:2, acc:0.9388, recall:0.9312, f1: 0.9350
|
|
|
+类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
|
|
|
+类别:4, acc:0.9324, recall:0.9857, f1: 0.9583
|
|
|
+类别:5, acc:0.9316, recall:0.9202, f1: 0.9258
|
|
|
+
|
|
|
+类别:0, acc:0.9455, recall:0.9478, f1: 0.9467
|
|
|
+类别:1, acc:0.9375, recall:0.9538, f1: 0.9456
|
|
|
+类别:2, acc:0.9275, recall:0.9295, f1: 0.9285
|
|
|
+类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
|
|
|
+类别:4, acc:0.9583, recall:0.9857, f1: 0.9718
|
|
|
+类别:5, acc:0.9262, recall:0.9159, f1: 0.9210
|
|
|
+
|
|
|
+类别:0, acc:0.9331, recall:0.9516, f1: 0.9423
|
|
|
+类别:1, acc:0.9524, recall:0.9467, f1: 0.9496
|
|
|
+类别:2, acc:0.9437, recall:0.9089, f1: 0.9260
|
|
|
+类别:3, acc:0.9565, recall:0.9565, f1: 0.9565
|
|
|
+类别:4, acc:0.9242, recall:0.9683, f1: 0.9457
|
|
|
+类别:5, acc:0.9270, recall:0.9261, f1: 0.9266
|
|
|
+
|
|
|
+新模型:
|
|
|
+类别:0, acc:0.9336, recall:0.9225, f1: 0.9280
|
|
|
+类别:1, acc:0.9389, recall:0.9762, f1: 0.9572
|
|
|
+类别:2, acc:0.8937, recall:0.9439, f1: 0.9181
|
|
|
+类别:3, acc:0.9130, recall:1.0000, f1: 0.9545
|
|
|
+类别:4, acc:0.9545, recall:0.8936, f1: 0.9231
|
|
|
+类别:5, acc:0.9445, recall:0.9292, f1: 0.9368
|
|
|
+旧模型:
|
|
|
+类别:0, acc:0.8323, recall:0.7694, f1: 0.7996
|
|
|
+类别:1, acc:0.9565, recall:0.8730, f1: 0.9129
|
|
|
+类别:2, acc:0.8800, recall:0.8491, f1: 0.8643
|
|
|
+类别:3, acc:0.8723, recall:0.9762, f1: 0.9213
|
|
|
+类别:4, acc:0.9778, recall:0.9362, f1: 0.9565
|
|
|
+类别:5, acc:0.8402, recall:0.8878, f1: 0.8633
|
|
|
+'''
|
|
|
+
|
|
|
def get_savedModel(weights_path="log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5",
                   export_dir="role_savedmodel2023-10-26"):
    """Export the trained classifier as a TensorFlow SavedModel.

    The model is rebuilt in a fresh graph, its variables are initialised, the
    checkpoint is loaded by layer name, and the result is written with
    ``tf.saved_model.simple_save`` using the signature inputs ``input0`` /
    ``input1`` and the output ``outputs``.

    Args:
        weights_path: Keras weights file to load.  The default is the
            checkpoint dated 2023-10-26 in the original notes (result after
            re-organising the annotated data).
        export_dir: Directory the SavedModel is written to.
    """
    sess = tf.Session(graph=tf.Graph())
    try:
        with sess.as_default(), sess.graph.as_default():
            vocab, matrix = getVocabAndMatrix(getModel_word())
            model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
            # Initialise first, then overwrite with the checkpoint values.
            sess.run(tf.global_variables_initializer())
            model.load_weights(weights_path, by_name=True)
            tf.saved_model.simple_save(session=sess,
                                       export_dir=export_dir,
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1],
                                               },
                                       outputs={"outputs": model.output})
    finally:
        # The session was previously left open; release it even on failure.
        sess.close()
|
|
|
+
|
|
|
def predict_pb(data_path='traindata/df_test_20230912_predict.xlsx',
               export_dir="role_savedmodel2023-10-08"):
    """Run an exported SavedModel on a spreadsheet and print its accuracy.

    Every mismatching (predicted, expected) class pair is printed, followed
    by the counts of correct and wrong rows and the accuracy.

    Args:
        data_path: Spreadsheet to evaluate; it must already contain the
            columns read by ``word2id``.
        export_dir: SavedModel directory to load.
            NOTE(review): the default differs from the directory written by
            get_savedModel() ("role_savedmodel2023-10-26") - confirm which
            export is meant.
    """
    df_val = pd.read_excel(data_path)
    old_x, old_y = word2id(df_val, seq_len=seq_len)

    sess_role = tf.Session()
    try:
        with sess_role.as_default(), sess_role.graph.as_default():
            meta_graph_def = tf.saved_model.loader.load(sess=sess_role, tags=["serve"],
                                                        export_dir=export_dir)
            signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
            signature = meta_graph_def.signature_def[signature_key]

            # Resolve the signature's tensor names to tensors of the graph.
            input0 = sess_role.graph.get_tensor_by_name(signature.inputs["input0"].name)
            input1 = sess_role.graph.get_tensor_by_name(signature.inputs["input1"].name)
            output = sess_role.graph.get_tensor_by_name(signature.outputs["outputs"].name)
            lg_old = sess_role.run(output, feed_dict={input0:old_x[0],
                                                      input1:old_x[1],
                                                      })
    finally:
        # The session was previously never closed.
        sess_role.close()

    print(lg_old[:3])
    pos = neg = 0
    for predicted_row, expected_row in zip(lg_old, old_y):
        predicted_id = np.argmax(predicted_row)
        expected_id = np.argmax(expected_row)
        if predicted_id != expected_id:
            print(predicted_id, expected_id)
            neg += 1
        else:
            pos += 1
    total = pos + neg
    # Guard the ratio: an empty spreadsheet used to raise ZeroDivisionError.
    print(pos, neg, pos / total if total else 0.0)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Script entry point: exactly one stage is enabled at a time; the other
    # stages are kept commented out so they can be switched on by hand.
    # train()
    test()
    # get_savedModel()
    # predict_pb()

    # Scratch check of tf.slice, kept for reference:
    # import tensorflow as tf
    #
    # # X = tf.constant([[[1, 1, 1], [2, 2, 2]],
    # #                  [[3, 3, 3], [4, 4, 4]],
    # #                  [[5, 5, 5], [6, 6, 6]]])
    # X = tf.constant([[1, 1, 1], [2, 2, 2]]
    #                 )
    # print(X.shape)
    # rs = tf.slice(X, [0, 0], [1, -1])
    # with tf.Session() as sess:
    #     print(sess.run(rs))
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|