#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author  : bidikeji
# @Time    : 2021/7/28 0028 11:32
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import sys
sys.path.append(os.path.abspath("../../.."))
import re
import numpy as np
import pandas as pd
from BiddingKG.dl.interface.modelFactory import Model_role_classify_word
from BiddingKG.dl.common.Utils import *
import tensorflow as tf
import tensorflow.keras.backend as K
# from tensorflow.keras import layers, models, optimizers, losses, callbacks
from keras import layers, models, optimizers, losses, callbacks
# import keras.backend as K
# from keras.models import Model
from keras.engine.topology import Layer
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE


def recall(y_true, y_pred):
    '''
    Compute recall.
    @Argus:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return
        recall
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    # Note: c3 is a tensor, so in TF1 graph mode this "== 0" check is an object
    # comparison that never triggers; see the epsilon-safe sketch below.
    if c3 == 0:
        return 0
    recall = c1 / c3
    return recall


def f1_score(y_true, y_pred):
    '''
    Compute F1.
    @Argus:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return
        F1 score
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    precision = c1 / c2
    if c3 == 0:
        recall = 0
    else:
        recall = c1 / c3
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score


def precision(y_true, y_pred):
    '''
    Compute precision.
    @Argus:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return
        precision
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = c1 / c2
    return precision
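# A minimal alternative sketch, not wired into the model below: the metrics above
# can divide by zero when a batch has no positive labels or no positive
# predictions (the "c3 == 0" guard never fires on tensors). Adding K.epsilon()
# to the denominators avoids NaN metric values; the original metrics above are
# left unchanged.
def precision_safe(y_true, y_pred):
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    pred_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_pos / (pred_pos + K.epsilon())


def recall_safe(y_true, y_pred):
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    real_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_pos / (real_pos + K.epsilon())


def f1_score_safe(y_true, y_pred):
    p = precision_safe(y_true, y_pred)
    r = recall_safe(y_true, y_pred)
    return 2 * p * r / (p + r + K.epsilon())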
seq_len = 30  # 20
sp = 30
lb2id = {'招标人': 0, '代理人': 1, '中标人': 2, '第二候选人': 3, '第三候选人': 4, '其他角色': 5}


def getBiLSTMModel(input_shape, vocab, embedding_weights, classes, use_am=False):
    # assert len(input_shape)==3
    list_input = []
    for i in range(input_shape[0]):
        list_input.append(layers.Input(shape=(input_shape[1],), dtype=tf.int32, name="input%d" % (i)))
    list_embedding = []
    embedding_input = list_input
    embedding = layers.Embedding(len(vocab), input_shape[2],
                                 weights=[embedding_weights] if embedding_weights is not None else None,
                                 mask_zero=True, trainable=True, name="char_embeding")
    for i in range(len(embedding_input)):
        list_embedding.append(embedding(embedding_input[i]))
    list_w2v = list_embedding
    list_lstm = []
    # list_lstm.append(layers.Bidirectional(layers.GRU(60, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[0]))  # dropout=0.5, recurrent_dropout=0.5
    # list_lstm.append(layers.Bidirectional(layers.GRU(60, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[1]))
    list_lstm.append(layers.Bidirectional(layers.LSTM(120, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[0]))  # dropout=0.5, recurrent_dropout=0.5
    list_lstm.append(layers.Bidirectional(layers.LSTM(120, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[1]))
    concat = layers.concatenate(list_lstm, axis=1)
    out = layers.Dense(classes, activation="softmax")(concat)
    model = models.Model(list_input, out)
    model.compile(optimizer=optimizers.Adam(lr=0.001), loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model


def labeling(label, out_len=6):
    out = np.zeros((out_len))
    out[label] = 1
    return out


def word2id(df, seq_len=seq_len, is_test=False):
    train_x = []
    train_y = []
    test_x = []
    test_y = []
    # print(set(df['label']))
    # print(set(lb2id))
    # if set(df['label']) == set(lb2id):
    #     df['label'] = df['label'].apply(lambda x:lb2id[x])
    for before, text, after, label in zip(df["front20"], df["entity_text"], df["behind20"], df["new_label"]):
        before = before if isinstance(before, str) else ""
        text = text if isinstance(text, str) else ""
        after = after if isinstance(after, str) else ""
        b = before.find('。')
        if b != -1:  # keep only the last sentence; the model should not see the preceding sentence
            before = before[b + 1:]
        e = after.find('。')
        if e != -1:
            after = after[:e + 1]
        x = encodeInput([before, after], word_len=seq_len, word_flag=True, userFool=False)
        if is_test:
            y = label
        else:
            y = labeling(label)
        train_x.append(x)
        train_y.append(y)
    return np.transpose(np.array(train_x), (1, 0, 2)), np.array(train_y)
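# Illustrative sketch, not part of the original pipeline: word2id turns a
# DataFrame with 'front20' / 'entity_text' / 'behind20' / 'new_label' columns
# into the two context inputs of the model above. The toy rows below are made
# up, and the expected shapes assume encodeInput returns one id sequence of
# length seq_len per context string.
def _word2id_shape_demo():
    df_demo = pd.DataFrame({
        'front20': ['招标人:', '受招标人委托,'],
        'entity_text': ['xx公司', 'xx代理公司'],
        'behind20': [',联系人', '发布招标公告'],
        'new_label': [0, 1],
    })
    x, y = word2id(df_demo)
    print('x.shape:', x.shape)  # expected (2, 2, seq_len): (model inputs, samples, tokens)
    print('y.shape:', y.shape)  # expected (2, 6): one-hot labels
    # x[0] feeds "input0" (left context), x[1] feeds "input1" (right context)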
def fix_digit_eng(text):
    '''
    Normalize digits, English codes / serial numbers, etc.
    :param text:
    :return:
    '''
    text = re.sub('第[一二三1-3]([条项章]|中学|医院|附属)|第三方(服务机构)?', 'xxx', text)
    text = re.sub('第01(中标|成交)?候选人', '第一中标候选人', text)
    text = re.sub('标段[一二三1-3]', '标段d', text)
    text = re.sub('第[一二三1-3](标段?|[分子标]?包)', 'd标段', text)
    text = re.sub('[a-zA-Z][a-zA-Z0-9=&_—-]{3,}', 'abc', text)
    text = re.sub('[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd', text)
    text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)
    text = re.sub('\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)
    text = re.sub('序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', '', text)
    return text.replace('(', '(').replace(')', ')')
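# Illustrative sketch, not part of the original pipeline: fix_digit_eng collapses
# long numbers, English codes and ordinal markers into placeholder tokens
# ('d', 'abc', ...) so surface variation does not fragment the context
# vocabulary. The sample strings are made up; run the function to inspect the
# normalized outputs.
def _fix_digit_eng_demo():
    for s in ['中标金额:1234.56万元',
              '项目编号:ZB2023-001',
              '第01中标候选人:xx公司']:
        print(repr(s), '->', repr(fix_digit_eng(s)))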
def train():
    # df_train = pd.read_excel('traindata/df_train_20230908.xlsx')
    # df_test = pd.read_excel('traindata/df_test_20230908.xlsx')
    # df_train = pd.read_excel('traindata/df_train_20230912.xlsx')
    # df_test = pd.read_excel('traindata/df_test_20230912.xlsx')
    # df_train = pd.read_excel('traindata/df_train_20230912_predict.xlsx')
    # df_test = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
    # df_train = pd.read_excel('traindata/df_train_20230912_2.xlsx')
    # df_test = pd.read_excel('traindata/df_test_20230912_2.xlsx')
    # df1 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_补充训练数据_train.xlsx')
    # df2 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_补充训练数据_test.xlsx')
    # df3 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_类别5的数据_补充数据_train.xlsx')
    # df4 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_类别5的数据_补充数据_test.xlsx')
    # df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
    # df_train = df_train.append([df1, df3, df, df, df, df], ignore_index=True)
    # df_test = df_test.append([df2, df4, df], ignore_index=True)

    df_train = pd.read_excel('traindata/所有训练测试数据_train.xlsx')
    df_test = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
    df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
    df_train = df_train.append([df, df, df, df], ignore_index=True)
    df_test = df_test.append([df], ignore_index=True)
    df_train = df_train.sample(frac=1)
    df_test = df_test.sample(frac=1)
    df_train['front20'] = df_train['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
    df_train['behind20'] = df_train['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
    df_test['front20'] = df_test['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
    df_test['behind20'] = df_test['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
    # df_train['front20'] = df_train['front'].apply(lambda x: str(x)[-seq_len:])
    # df_train['behind20'] = df_train['behind'].apply(lambda x: str(x)[:seq_len])
    # df_test['front20'] = df_test['front'].apply(lambda x: str(x)[-seq_len:])
    # df_test['behind20'] = df_test['behind'].apply(lambda x: str(x)[:seq_len])
    df_train.fillna("", inplace=True)
    df_test.fillna("", inplace=True)
    if 'relabel' in df_train.columns:
        df_train['new_label'] = df_train.apply(
            lambda x: int(x['relabel']) if x['relabel'] != "" else int(x['new_label']), axis=1)
    if 'relabel' in df_test.columns:
        df_test['new_label'] = df_test.apply(
            lambda x: int(x['relabel']) if x['relabel'] != "" else int(x['new_label']), axis=1)
    print('df_train', set(df_train['new_label']), set(df_train['relabel']))
    print('df_test', set(df_test['new_label']), set(df_test['relabel']))
    df_train = df_train[df_train['new_label'].isin([0, 1, 2, 3, 4, 5])]
    df_test = df_test[df_test['new_label'].isin([0, 1, 2, 3, 4, 5])]
    print('训练数据:%d,测试数据:%d' % (len(df_train), len(df_test)))
    print(set(df_train['new_label']), set(lb2id.values()))
    assert set(df_train['new_label']) == set(lb2id.values())
    train_x, train_y = word2id(df_train)
    print('train_x.shape', train_x.shape)
    print('train_y.shape', train_y.shape)
    print('train_x: ', train_x[0])
    test_x, test_y = word2id(df_test)
    with tf.Session() as sess:
        vocab, matrix = getVocabAndMatrix(getModel_word())
        model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
        print("loading weights")
        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5", by_name=True, skip_mismatch=True)
        # model.load_weights("log/ep008-loss0.103-val_loss0.109-f1_score0.970.h5", by_name=True)  # variant with one extra dense layer
        # model.load_weights("log/ep021-loss0.078-val_loss0.104-f1_score0.969.h5", by_name=True)  # variant with an extra LSTM joining the front/back LSTM outputs
        callback = callbacks.ModelCheckpoint(
            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
        model.fit(x=[train_x[0], train_x[1]], y=train_y, batch_size=512, epochs=100, callbacks=[callback],
                  validation_data=[[test_x[0], test_x[1]], test_y])
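# Optional helper, a sketch that train() does not call: pick the most recently
# written checkpoint under log/ (following the ModelCheckpoint filename pattern
# used above) so a later run can resume or evaluate from it.
def _latest_checkpoint(log_dir='log'):
    import glob
    paths = glob.glob(os.path.join(log_dir, 'ep*.h5'))
    return max(paths, key=os.path.getmtime) if paths else None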
def test():
    # df_val = pd.read_excel('traindata/df_test_20230908.xlsx')
    # df_val = pd.read_excel('traindata/df_test_20230908_predict.xlsx')
    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据.xlsx')
    # df_val = pd.read_excel('traindata/df_test_20230912_2.xlsx')
    # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
    # df_val = pd.read_excel('traindata/df_train_20230912_2.xlsx')
    # df_val = pd.read_excel('traindata/角色实体分类新旧数据汇总.xlsx')
    # df_val = pd.read_excel('E:/角色金额数据/数据库验证数据原模型识别结果20230926.xlsx')
    # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果.xlsx')
    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果60000-90000.xlsx')
    # df_val = pd.read_excel('traindata/2023-08-24所有公告_重新预测结果40000-60000_重新不一致结果.xlsx')
    # df_val = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='不确定角色表达')
    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_筛选前后文不同的数据.xlsx')
    # df_val = pd.read_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果.xlsx')
    # df_val = pd.read_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果_re.xlsx')
    # df_val = pd.read_excel('E:\实体识别数据/少于10条关键词补充数据.xlsx')
    # df_val = pd.read_excel('traindata/所有训练测试数据_add.xlsx')
    df_val = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
    # df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
    # df_val = df_val.append([df], ignore_index=True)
    # df_val = df_val[['entity_id', 'docid', 'label', 'front', 'entity_text', 'behind',
    #                  'new_label', 'relabel', 'kws', 'new_old', 'front20', 'behind20',
    #                  'front_reverse', 'pred_new', 'prob_new', 'new=lb']]
    # df_val = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据.xlsx')
    # df_val = pd.read_excel('traindata/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据_predict.xlsx')
    # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据_predict_重新标注数据20230919.xlsx')
    lb2id = {'招标人': 0, '代理人': 1, '中标人': 2, '第二候选人': 3, '第三候选人': 4, '其他角色': 5}
    # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据.xlsx')
    # df_val['label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] != '' else x['label'], axis=1)
    # df_val['new_label'] = df_val['label'].apply(lambda x: lb2id[x])
    # df_val['label'] = df_val['label'].apply(lambda x: lb2id[x])
    # df_val['relabel'] = df_val['relabel'].apply(lambda x: lb2id.get(x, ''))
    # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据_predict.xlsx')
    # df_val.fillna('', inplace=True)
    # print('测试公告数量:', len(df_val), set(df_val['new_label']))
    # df_val['new_label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] != '' else x['new_label'], axis=1)

    # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
    # df_val = pd.read_excel('traindata/df_test_20230912_2.xlsx')
    # print(df_val.columns)
    # df2 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_补充训练数据_test.xlsx')
    # df4 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_类别5的数据_补充数据_test.xlsx')
    # # df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
    # print(df2.columns)
    # df_val = df_val.append([df2, df4], ignore_index=True)
    # df_val = df_val[['entity_id', 'docid', 'label', 'front', 'entity_text', 'behind',
    #                  'new_label', 'relabel', 'kws', 'new_old', 'front20', 'behind20',
    #                  'front_reverse', 'pred_new', 'prob_new', 'new=lb']]
    df_val.fillna('', inplace=True)
    # df_val = df_val[df_val['relabel'] != 6]
    # for i in df_val.index:
    #     b = df_val.loc[i, 'front']
    #     e = df_val.loc[i, 'behind']
    #     if not isinstance(b, str):
    #         print('异常数据', i, type(b))
    #     if not isinstance(e, str):
    #         print('异常数据', i, type(e))
    if 'new_label' in df_val.columns:
        if 'relabel' in df_val.columns:
            df_val['new_label'] = df_val.apply(
                lambda x: x['relabel'] if x['relabel'] in [0, 1, 2, 3, 4, 5] else x['new_label'], axis=1)
    else:
        df_val['new_label'] = df_val['label']
    # df_val['new_label'] = df_val['new_label'].apply(lambda x: x if x in [0, 1, 2, 3, 4, 5] else 5)
    # df_val = df_val[df_val['new_label'].isin([0,1,2,3,4,5])]
    print('测试公告数量:', len(df_val), set(df_val['new_label']))
    df_val['new_label'] = df_val['new_label'].apply(lambda x: int(x))
    df_val['front20'] = df_val['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
    df_val['behind20'] = df_val['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
    # df_val.drop_duplicates(subset=['front20', 'behind20'], inplace=True)
    # print('测试公告去重后数量:', len(df_val))
    # df_val['front20'] = df_val['front'].apply(lambda x: str(x)[-seq_len:])
    # df_val['behind20'] = df_val['behind'].apply(lambda x: str(x)[:seq_len])
    df_val['front_reverse'] = df_val['front20'].apply(lambda x: x[-6:][::-1])
    # df_val['label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] != "" else x['label'], axis=1)
    # df_val['label'] = df_val['label'].apply(lambda x: lb2id[x] if x in lb2id else x)
    df_val.reset_index(drop=True, inplace=True)
    val_x, val_y = word2id(df_val, seq_len=seq_len, is_test=True)
    # val_x = np.transpose(np.array(train_x), (1, 0, 2))
    # old_x, old_y = word2id(df_val, seq_len=50)
    # old_x = np.transpose(np.array(old_x), (1, 0, 2))
    role_old = Model_role_classify_word()
    with tf.Session() as sess:
        vocab, matrix = getVocabAndMatrix(getModel_word())
        model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
        print("loading weights")
        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5", by_name=True, skip_mismatch=True)
        # model.load_weights("log/ep006-loss0.174-val_loss0.234-f1_score0.917.h5", by_name=True, skip_mismatch=True)
        # model.load_weights("log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5", by_name=True, skip_mismatch=True)
        # model.load_weights("log/ep014-loss0.091-val_loss0.110-f1_score0.968.h5", by_name=True, skip_mismatch=True)
        # model.load_weights("log/ep008-loss0.162-val_loss0.173-f1_score0.947.h5", by_name=True)  # 20230425 entity removed, front/back contexts merged into one input; poor results, tenderer vs. agent hard to separate, especially the "受...委托" (entrusted-by) pattern
        # model.load_weights("log/ep009-loss0.104-val_loss0.115-f1_score0.966.h5", by_name=True)  # 20230425 entity removed, separate front/back inputs
        # model.load_weights("log/ep008-loss0.103-val_loss0.109-f1_score0.970.h5", by_name=True)  # 20230425 entity removed, separate front/back inputs, one extra dense layer
        # model.load_weights("log/ep019-loss0.087-val_loss0.106-f1_score0.968.h5", by_name=True)  # 20230425 separate front/back inputs, entity replaced by "公司"; three LSTM inputs merged, then another LSTM
        # model.load_weights("log/ep004-loss0.069-val_loss0.103-f1_score0.971.h5", by_name=True)  # 20230425 separate front/back inputs, entity removed; two LSTM inputs merged, then another LSTM
        # model.load_weights("log/20ep045-loss0.140-val_loss0.181-f1_score0.941.h5", by_name=True)  # 20230908 separate front/back inputs, entity removed; two LSTM outputs merged directly
        # model.load_weights("log/20912ep038-loss0.123-val_loss0.181-f1_score0.947.h5", by_name=True)  # 20230908 separate front/back inputs, entity removed; two LSTM outputs merged directly
        # model.load_weights("log/ep068-loss0.075-val_loss0.190-f1_score0.941.h5", by_name=True)  # 20230908 separate front/back GRU inputs, entity removed
        # model.load_weights("log/gruep043-loss0.124-val_loss0.177-f1_score0.947.h5", by_name=True)  # 20230908 separate front/back GRU inputs, entity removed
        # model.load_weights("log/ep052-loss0.130-val_loss0.216-f1_score0.931.h5", by_name=True)  # 20230919 separate front/back GRU inputs, entity removed; new annotations plus relabeled old data
        model.load_weights("log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5", by_name=True)  # 20231008 separate front/back LSTM inputs, entity removed; final selected model
        # lg_old = role_old.predict(old_x)
        # df_val['pred_old'] = pd.DataFrame(np.argmax(lg_old, axis=1))
        # df_val['prob_old'] = pd.DataFrame(np.amax(lg_old, axis=1))
        # logit = model.predict([val_x[0], val_x[1], val_x[2]])
        # print('新模型预测结果', logit[:3])
        # print('旧模型预测结果:', lg_old[:3])
        # df_val['pred_new'] = pd.DataFrame(np.argmax(logit, axis=-1))
        # df_val['prob_new'] = pd.DataFrame(np.amax(logit, axis=1))
        # # df_val['new=new3'] = df_val.apply(lambda x: 1 if x['pred_new3'] == x['pred_new2'] else 0, axis=1)
        # df_val['new=old'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
        # df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['pred_old'] else 0, axis=1)
        # df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)

        # logit = model.predict([val_x])
        logit = model.predict([val_x[0], val_x[1]])
        print('新模型预测结果', logit[:3])
        # df_val['pred_new2'] = df_val['pred_new']
        df_val['pred_new'] = pd.DataFrame(np.argmax(logit, axis=-1))
        df_val['prob_new'] = pd.DataFrame(np.amax(logit, axis=1))
        # df_val['new=new2'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['pred_new2'] else 0, axis=1)
        df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['new_label'] else 0, axis=1)
        for it in set(df_val['new_label']):
            df_tmp = df_val[df_val['new_label'] == it]
            lb = len(df_tmp)
            eq = sum(df_tmp['new=lb'])
            pr = len(df_val[df_val['pred_new'] == it])
            acc = eq / pr if pr > 0 else 0  # note: 'acc' here is per-class precision
            recall = eq / lb if lb > 0 else 0
            f1 = 2 * recall * acc / (acc + recall) if (acc + recall) > 0 else 0
            print('类别:%d, acc:%.4f, recall:%.4f, f1: %.4f' % (it, acc, recall, f1))
        print('旧模型:')
        df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['new_label'] else 0, axis=1)
        for it in set(df_val['label']):
            df_tmp = df_val[df_val['new_label'] == it]
            lb = len(df_tmp)
            eq = sum(df_tmp['old=lb'])
            pr = len(df_val[df_val['label'] == it])
            acc = eq / pr if pr > 0 else 0
            recall = eq / lb if lb > 0 else 0
            f1 = 2 * recall * acc / (acc + recall) if (acc + recall) > 0 else 0
            print('类别:%d, acc:%.4f, recall:%.4f, f1: %.4f' % (it, acc, recall, f1))
        # df_val.to_excel('traindata/df_val_predict.xlsx')
        # df_val.to_excel('traindata/兼职标注数据_test29_predict.xlsx')
        # df_val.to_excel('traindata/兼职标注数据_test3_predict.xlsx')
        # df_val.to_excel('traindata/df_test_20230908_predict.xlsx', index=False)
        # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据_predict.xlsx', index=False)
        # df_val.to_excel('traindata/旧训练测试数据_筛选数据_predict_重新标注数据20230919.xlsx', index=False)
        # df_val.to_excel('traindata/旧训练测试数据_筛选数据_predict.xlsx', index=False)
        # df_val.to_excel('traindata/df_test_20230912_predict.xlsx', index=False)
        # df_val.to_excel('traindata/df_test_20230912_加补充数据_predict.xlsx', index=False)
        # df_val.to_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果.xlsx', index=False)
        # df_val.to_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果_re.xlsx', index=False)
        # df_val.to_excel('E:\实体识别数据/少于10条关键词补充数据.xlsx', index=False)
        # df_val.to_excel('traindata/所有训练测试数据_add_predict.xlsx', index=False)
        # df_val.to_excel('traindata/所有训练测试数据_test_predict.xlsx', index=False)
        # df_val.to_excel('traindata/df_train_20230912_predict.xlsx', index=False)

        # df_val = df_val[df_val['new=lb'] == 0]
        # for i in df_val.index:
        #     if ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'front'])) or ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'behind'])):
        #         print('过滤异常数据', i, ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'front'])) or ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'behind'])))
        #         df_val.drop(index=i, inplace=True)
        # print('不一致数量: ', len(df_val))
        # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果_重新不一致结果.xlsx', index=False)
        # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果40000-60000_重新不一致结果.xlsx', index=False)
        # df_val.to_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据.xlsx', index=False)
        # df_val.to_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_pred.xlsx', index=False)
        # df_val.to_excel('traindata/角色实体分类新旧数据汇总_predict.xlsx', index=False)
        # df_val.to_excel('E:/角色金额数据/数据库验证数据原模型识别结果20230926_predict.xlsx', index=False)
        # df_val.to_excel('E:\角色金额数据/易错角色表达_predict.xlsx', index=False)
        print('df_val.columns', df_val.columns)
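# Optional cross-check, a sketch that test() does not call (assumes scikit-learn
# is installed): the per-class precision/recall/F1 loop above can be verified
# with sklearn. Note that the 'acc' printed by test() is per-class precision,
# not overall accuracy.
def _report_with_sklearn(df_val):
    from sklearn.metrics import classification_report
    print(classification_report(df_val['new_label'], df_val['pred_new'], digits=4))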
'''
Recorded per-class evaluation results from earlier experiment runs
(新模型 = new model, 旧模型 = old model):

类别:0, acc:0.4199, recall:0.6492, f1: 0.5099
类别:1, acc:0.5126, recall:0.7846, f1: 0.6201
类别:2, acc:0.4416, recall:0.6632, f1: 0.5301
类别:3, acc:0.7455, recall:0.7961, f1: 0.7700
类别:4, acc:0.7471, recall:0.8553, f1: 0.7975
类别:5, acc:0.9664, recall:0.9100, f1: 0.9373

类别:0, acc:0.9537, recall:0.9777, f1: 0.9655
类别:1, acc:0.9589, recall:0.9722, f1: 0.9655
类别:2, acc:0.9227, recall:0.9502, f1: 0.9363
类别:3, acc:0.8750, recall:0.9333, f1: 0.9032
类别:4, acc:0.9643, recall:1.0000, f1: 0.9818
类别:5, acc:0.9476, recall:0.8690, f1: 0.9066

类别:0, acc:0.9393, recall:0.9319, f1: 0.9356
类别:1, acc:0.9500, recall:0.9620, f1: 0.9560
类别:2, acc:0.9156, recall:0.9406, f1: 0.9279
类别:3, acc:0.8857, recall:0.9394, f1: 0.9118
类别:4, acc:0.9655, recall:0.9333, f1: 0.9492
类别:5, acc:0.9102, recall:0.8990, f1: 0.9046

类别:0, acc:0.9357, recall:0.9615, f1: 0.9484
类别:1, acc:0.9538, recall:0.9483, f1: 0.9510
类别:2, acc:0.9271, recall:0.9366, f1: 0.9318
类别:3, acc:0.9600, recall:0.9863, f1: 0.9730
类别:4, acc:0.9429, recall:0.9851, f1: 0.9635
类别:5, acc:0.9407, recall:0.9098, f1: 0.9250

类别:0, acc:0.9402, recall:0.9556, f1: 0.9478
类别:1, acc:0.9593, recall:0.9375, f1: 0.9483
类别:2, acc:0.9243, recall:0.9412, f1: 0.9327
类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
类别:4, acc:0.9452, recall:0.9857, f1: 0.9650
类别:5, acc:0.9296, recall:0.9058, f1: 0.9176

类别:0, acc:0.9468, recall:0.9568, f1: 0.9518
类别:1, acc:0.9489, recall:0.9489, f1: 0.9489
类别:2, acc:0.9388, recall:0.9312, f1: 0.9350
类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
类别:4, acc:0.9324, recall:0.9857, f1: 0.9583
类别:5, acc:0.9316, recall:0.9202, f1: 0.9258

类别:0, acc:0.9455, recall:0.9478, f1: 0.9467
类别:1, acc:0.9375, recall:0.9538, f1: 0.9456
类别:2, acc:0.9275, recall:0.9295, f1: 0.9285
类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
类别:4, acc:0.9583, recall:0.9857, f1: 0.9718
类别:5, acc:0.9262, recall:0.9159, f1: 0.9210

类别:0, acc:0.9331, recall:0.9516, f1: 0.9423
类别:1, acc:0.9524, recall:0.9467, f1: 0.9496
类别:2, acc:0.9437, recall:0.9089, f1: 0.9260
类别:3, acc:0.9565, recall:0.9565, f1: 0.9565
类别:4, acc:0.9242, recall:0.9683, f1: 0.9457
类别:5, acc:0.9270, recall:0.9261, f1: 0.9266

新模型:
类别:0, acc:0.9336, recall:0.9225, f1: 0.9280
类别:1, acc:0.9389, recall:0.9762, f1: 0.9572
类别:2, acc:0.8937, recall:0.9439, f1: 0.9181
类别:3, acc:0.9130, recall:1.0000, f1: 0.9545
类别:4, acc:0.9545, recall:0.8936, f1: 0.9231
类别:5, acc:0.9445, recall:0.9292, f1: 0.9368

旧模型:
类别:0, acc:0.8323, recall:0.7694, f1: 0.7996
类别:1, acc:0.9565, recall:0.8730, f1: 0.9129
类别:2, acc:0.8800, recall:0.8491, f1: 0.8643
类别:3, acc:0.8723, recall:0.9762, f1: 0.9213
类别:4, acc:0.9778, recall:0.9362, f1: 0.9565
类别:5, acc:0.8402, recall:0.8878, f1: 0.8633
'''
def get_savedModel():
    sess = tf.Session(graph=tf.Graph())
    with sess.as_default():
        with sess.graph.as_default():
            vocab, matrix = getVocabAndMatrix(getModel_word())
            model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
            sess.run(tf.global_variables_initializer())
            # model.load_weights(filepath="log/ep009-loss0.057-val_loss0.076-f1_score0.978.h5")
            # model.load_weights(filepath="log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5")  # best 20-char model from the Jul 30 training run
            # model.load_weights(filepath="../../dl_dev/role/log/ep015-loss0.090-val_loss0.113-f1_score0.967.h5")  # retrained Aug 5 after adjusting part of the tenderer annotations, 20-char context
            # model.load_weights("log/ep004-loss0.069-val_loss0.103-f1_score0.971.h5",  # 20230427
            # model.load_weights("log/ep059-loss0.096-val_loss0.180-f1_score0.945.h5",  # 20231008 result after re-organizing the annotated data
            # model.load_weights("log/ep059-loss0.101-val_loss0.191-f1_score0.940.h5",  # 20231012 result after re-organizing the annotated data
            # model.load_weights("log/ep052-loss0.123-val_loss0.194-f1_score0.937.h5",  # 20231012 result after re-organizing the annotated data
            model.load_weights("log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5",  # 20231026 result after re-organizing the annotated data
                               by_name=True)  # 20230425 separate front/back inputs, entity removed; two LSTM inputs merged, then another LSTM  2023/04/27
            tf.saved_model.simple_save(session=sess,
                                       export_dir="role_savedmodel2023-10-26",  # role_savedmodel2021-8-5
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1],
                                               },  # "input2": model.input[2]
                                       outputs={"outputs": model.output})
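# Optional check, a sketch (assumes the saved_model_cli tool that ships with
# TensorFlow is on PATH): inspect the exported signature before serving, e.g.
#   saved_model_cli show --dir role_savedmodel2023-10-26 --all
# It should list the "input0"/"input1" inputs and the "outputs" tensor declared
# in get_savedModel above.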
def predict_pb():
    # df_val = pd.read_excel('traindata/df_val.xlsx')
    df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
    old_x, old_y = word2id(df_val, seq_len=seq_len)
    # old_x = np.transpose(np.array(old_x), (1, 0, 2))
    sess_role = tf.Session()
    with sess_role.as_default() as sess:
        with sess_role.graph.as_default():
            meta_graph_def = tf.saved_model.loader.load(sess=sess_role, tags=["serve"],
                                                        export_dir="role_savedmodel2023-10-08")  # role_savedmodel2021-8-5 role_savedmodel2023-04-27
            signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
            signature_def = meta_graph_def.signature_def
            input0 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)
            input1 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)
            # input2 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
            output = sess_role.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
            model_role = [[input0, input1], output]  # , input2
            lg_old = sess_role.run(output, feed_dict={input0: old_x[0],
                                                      input1: old_x[1],
                                                      })  # input2: old_x[2]
            print(lg_old[:3])
            pos = neg = 0
            for i in range(len(lg_old)):
                if np.argmax(lg_old[i]) != np.argmax(old_y[i]):
                    print(np.argmax(lg_old[i]), np.argmax(old_y[i]))
                    neg += 1
                else:
                    pos += 1
            print(pos, neg, pos / (pos + neg))


if __name__ == "__main__":
    # train()
    test()
    # get_savedModel()
    # predict_pb()

    # import tensorflow as tf
    # # X = tf.constant([[[1, 1, 1], [2, 2, 2]],
    # #                  [[3, 3, 3], [4, 4, 4]],
    # #                  [[5, 5, 5], [6, 6, 6]]])
    # X = tf.constant([[1, 1, 1], [2, 2, 2]])
    # print(X.shape)
    # rs = tf.slice(X, [0, 0], [1, -1])
    # with tf.Session() as sess:
    #     print(sess.run(rs))