123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619 |
- #!/usr/bin/python3
- # -*- coding: utf-8 -*-
- # @Author : bidikeji
- # @Time : 2021/7/28 0028 11:32
- import os
- # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
- import sys
- sys.path.append(os.path.abspath("../../.."))
- import pandas as pd
- from BiddingKG.dl.interface.modelFactory import Model_role_classify_word
- from BiddingKG.dl.common.Utils import *
- import tensorflow as tf
- import tensorflow.keras.backend as K
- # from tensorflow.keras import layers, models,optimizers,losses,callbacks
- from keras import layers, models,optimizers,losses,callbacks
- # import keras.backend as K
- # from keras.models import Model
- from keras.engine.topology import Layer
- from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
def recall(y_true, y_pred):
    """Keras metric: recall of the rounded predictions.

    Args:
        y_true: ground-truth label tensor.
        y_pred: predicted label tensor (same shape as ``y_true``).

    Returns:
        Scalar tensor ``true_positives / actual_positives``.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # A Python-level ``if actual_positives == 0`` is evaluated once while the
    # metric graph is being built, not per batch, so it cannot guard the
    # division at run time.  Adding epsilon keeps the result finite (and 0
    # when there are no positives) for every batch.
    return true_positives / (actual_positives + K.epsilon())
def f1_score(y_true, y_pred):
    """Keras metric: F1 score of the rounded predictions.

    Args:
        y_true: ground-truth label tensor.
        y_pred: predicted label tensor (same shape as ``y_true``).

    Returns:
        Scalar tensor ``2 * P * R / (P + R)`` where P is precision and R is
        recall, both computed from the rounded, clipped tensors.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    # Every denominator can be zero for a batch (no predicted positives, no
    # actual positives, or P + R == 0).  A Python ``if tensor == 0`` cannot
    # branch on the per-batch value, so epsilon is added instead; this yields
    # 0 rather than NaN in those cases.
    eps = K.epsilon()
    prec = true_positives / (predicted_positives + eps)
    rec = true_positives / (actual_positives + eps)
    return 2 * (prec * rec) / (prec + rec + eps)
def precision(y_true, y_pred):
    """Keras metric: precision of the rounded predictions.

    Args:
        y_true: ground-truth label tensor.
        y_pred: predicted label tensor (same shape as ``y_true``).

    Returns:
        Scalar tensor ``true_positives / predicted_positives``.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # Epsilon keeps the metric finite (0 instead of NaN) when a batch has no
    # positive predictions.
    return true_positives / (predicted_positives + K.epsilon())
# Context window length: train()/test() keep this many characters on each side
# of the entity and pass it to encodeInput as word_len.  The original trailing
# comment reads "20" -- presumably an earlier value; confirm before relying on it.
seq_len = 30
# Not referenced anywhere else in the visible part of this file.
sp = 30
# Role name -> class id.  The ids are the six classes the classifier outputs.
lb2id = {
    role_name: class_id
    for class_id, role_name in enumerate(
        ('招标人', '代理人', '中标人', '第二候选人', '第三候选人', '其他角色')
    )
}
def getBiLSTMModel(input_shape, vocab, embedding_weights, classes, use_am=False):
    """Build and compile the context-based role classifier.

    One integer input is created per context window.  All inputs share one
    embedding layer, each embedded input is encoded by its own bidirectional
    LSTM, and the encodings are concatenated and fed to a softmax layer.

    Args:
        input_shape: ``(n_inputs, sequence_length, embedding_dim)``.
        vocab: vocabulary; only ``len(vocab)`` is used (embedding row count).
        embedding_weights: initial embedding matrix, or ``None`` for the
            layer's default initialisation.
        classes: number of output classes.
        use_am: unused; kept so existing callers keep working.

    Returns:
        The compiled Keras model (its summary is printed as a side effect).
    """
    # assert len(input_shape) == 3
    list_input = [
        layers.Input(shape=(input_shape[1],), dtype=tf.int32, name="input%d" % i)
        for i in range(input_shape[0])
    ]

    embedding = layers.Embedding(
        len(vocab), input_shape[2],
        weights=[embedding_weights] if embedding_weights is not None else None,
        mask_zero=True, trainable=True, name="char_embeding")
    list_embedding = [embedding(context_input) for context_input in list_input]

    # One encoder per input.  Previously only indices 0 and 1 were encoded, so
    # any other n_inputs either crashed or silently ignored inputs.  Layers are
    # created in the same order as before, so the auto-generated layer names
    # (relied on by load_weights(by_name=True)) are unchanged for two inputs.
    list_lstm = []
    for embedded in list_embedding:
        encoder = layers.Bidirectional(
            layers.LSTM(120, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))
        list_lstm.append(encoder(embedded))

    # Keras' concatenate needs at least two tensors.
    if len(list_lstm) == 1:
        concat = list_lstm[0]
    else:
        concat = layers.concatenate(list_lstm, axis=1)

    out = layers.Dense(classes, activation="softmax")(concat)
    model = models.Model(list_input, out)
    model.compile(optimizer=optimizers.Adam(lr=0.001),
                  loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
def labeling(label, out_len=6):
    """Return a one-hot float vector of length ``out_len``.

    Args:
        label: index to set to 1.
        out_len: vector length (defaults to the six role classes).

    Returns:
        1-D numpy array of zeros with ``label``'s position set to 1.
    """
    one_hot = np.zeros(out_len)
    one_hot[label] = 1
    return one_hot
def word2id(df, seq_len=seq_len, is_test=False):
    """Encode the front/behind context of every row into model inputs.

    Args:
        df: frame with ``front20``, ``entity_text``, ``behind20`` and
            ``new_label`` columns.
        seq_len: passed to ``encodeInput`` as ``word_len``.
        is_test: when true the raw label is returned; otherwise it is
            one-hot encoded with ``labeling``.

    Returns:
        ``(x, y)``: ``x`` is the stacked encodings transposed with axes
        ``(1, 0, 2)`` so that ``x[0]`` / ``x[1]`` are the front / behind
        inputs (assumes ``encodeInput`` returns one id row per context --
        TODO confirm); ``y`` is the array of labels.
    """
    features = []
    targets = []
    rows = zip(df["front20"], df["entity_text"], df["behind20"], df["new_label"])
    # The entity text column is read but deliberately not fed to the model.
    for front, _entity, behind, label in rows:
        if not isinstance(front, str):
            front = ""
        if not isinstance(behind, str):
            behind = ""

        # Sentence split: drop everything up to the FIRST '。' in the front
        # context.  NOTE(review): the last '。' (rfind) may have been intended;
        # left unchanged because existing checkpoints were trained this way.
        _, stop, tail = front.partition('。')
        if stop:
            front = tail
        # Keep the behind context up to and including its first '。'.
        head, stop, _ = behind.partition('。')
        if stop:
            behind = head + stop

        features.append(
            encodeInput([front, behind], word_len=seq_len, word_flag=True, userFool=False))
        targets.append(label if is_test else labeling(label))
    return np.transpose(np.array(features), (1, 0, 2)), np.array(targets)
# (pattern, replacement) pairs applied in order by fix_digit_eng.  Patterns are
# raw strings so that ``\d``, ``\[`` and ``\.`` reach the regex engine without
# relying on Python's handling of invalid string escapes.
_DIGIT_ENG_RULES = (
    # Ordinal followed by 条/项/章/中学/医院/附属, or 第三方(服务机构) -> 'xxx'.
    (r'第[一二三1-3]([条项章]|中学|医院|附属)|第三方(服务机构)?', 'xxx'),
    # '第01(中标|成交)候选人' -> '第一中标候选人'.
    (r'第01(中标|成交)?候选人', '第一中标候选人'),
    # Numbered lot after '标段' -> '标段d'.
    (r'标段[一二三1-3]', '标段d'),
    # Numbered lot / package written '第N标(段)' or '第N(分|子|标)包' -> 'd标段'.
    (r'第[一二三1-3](标段?|[分子标]?包)', 'd标段'),
    # A letter followed by 3+ letters/digits/symbols -> 'abc'.
    (r'[a-zA-Z][a-zA-Z0-9=&_—-]{3,}', 'abc'),
    # Bracketed numbers and digit groups joined by ':' '.' '-' -> 'd'.
    (r'[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd'),
    # Runs of Chinese numerals (2+), or any run of 四..十 -> 'd'.
    (r'[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd'),
    # Multi-digit numbers, decimals, and the single digits 0 and 4-9 -> 'd'
    # (single 1, 2, 3 are left as they are).
    (r'\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd'),
    # Sequence numbers, 'N次' counters and 'N、' list markers are removed.
    (r'序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', ''),
)


def fix_digit_eng(text):
    """Normalise numbers, ordinals and alphanumeric codes in ``text``.

    Applies the substitutions in ``_DIGIT_ENG_RULES`` in order; later rules
    see the output of earlier ones.

    Args:
        text: input string.

    Returns:
        The normalised string.
    """
    for pattern, replacement in _DIGIT_ENG_RULES:
        text = re.sub(pattern, replacement, text)
    # NOTE(review): as written, each replace maps a parenthesis to itself and
    # therefore changes nothing.  Presumably the first argument of each call
    # was a full-width parenthesis -- confirm against the original file before
    # changing, since it affects the features existing checkpoints were
    # trained on.
    return text.replace('(', '(').replace(')', ')')
def _prepare_training_frame(frame, name):
    """Add context columns, resolve relabels and keep rows with a known class.

    Args:
        frame: raw data with ``front``, ``behind`` and ``new_label`` columns
            and optionally ``relabel``.
        name: tag printed in front of the label summary.

    Returns:
        The rows of ``frame`` whose ``new_label`` is one of the ids in lb2id.
    """
    # NOTE(review): str() runs before fillna, so a missing front/behind value
    # becomes the literal text 'nan'.  Left as is because existing checkpoints
    # were trained on features produced this way.
    frame['front20'] = frame['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
    frame['behind20'] = frame['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
    frame.fillna("", inplace=True)

    has_relabel = 'relabel' in frame.columns
    if has_relabel:
        # A non-empty relabel overrides the stored label.
        frame['new_label'] = frame.apply(
            lambda x: int(x['relabel']) if x['relabel'] != "" else int(x['new_label']),
            axis=1)
    # The summary used to read frame['relabel'] unconditionally, which raised
    # KeyError for data without that column.
    print(name, set(frame['new_label']),
          set(frame['relabel']) if has_relabel else set())
    return frame[frame['new_label'].isin(list(lb2id.values()))]


def train():
    """Train the role classifier and checkpoint the best weights under log/.

    Reads the train/test spreadsheets plus a sheet of supplementary examples
    (added four times to the training set and once to the test set), encodes
    the contexts with ``word2id`` and fits the model from ``getBiLSTMModel``
    for 100 epochs, keeping the weights with the lowest validation loss.

    Raises:
        ValueError: if the training labels do not cover exactly the class ids
            in ``lb2id``.
    """
    df_train = pd.read_excel('traindata/所有训练测试数据_train.xlsx')
    df_test = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
    # Raw string: the path contains a backslash.
    df_extra = pd.read_excel(r'E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')

    # pd.concat is the replacement for DataFrame.append (removed in pandas 2.0).
    df_train = pd.concat([df_train] + [df_extra] * 4, ignore_index=True)
    df_test = pd.concat([df_test, df_extra], ignore_index=True)

    # Shuffle both sets.
    df_train = df_train.sample(frac=1)
    df_test = df_test.sample(frac=1)

    df_train = _prepare_training_frame(df_train, 'df_train')
    df_test = _prepare_training_frame(df_test, 'df_test')

    print('训练数据:%d,测试数据:%d' % (len(df_train), len(df_test)))
    print(set(df_train['new_label']), set(lb2id.values()))
    # Explicit check instead of assert, which is stripped under ``python -O``.
    if set(df_train['new_label']) != set(lb2id.values()):
        raise ValueError('training labels do not match the class ids in lb2id')

    train_x, train_y = word2id(df_train)
    print('train_x.shape', train_x.shape)
    print('train_y.shape', train_y.shape)
    print('train_x: ', train_x[0])
    test_x, test_y = word2id(df_test)

    with tf.Session():
        vocab, matrix = getVocabAndMatrix(getModel_word())
        model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab,
                               embedding_weights=matrix, classes=6)
        print("loading weights")
        # To resume from a checkpoint, call model.load_weights(path, by_name=True) here.
        callback = callbacks.ModelCheckpoint(
            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
        model.fit(x=[train_x[0], train_x[1]], y=train_y, batch_size=512, epochs=100,
                  callbacks=[callback],
                  validation_data=([test_x[0], test_x[1]], test_y))
def _print_per_class_metrics(df, classes, pred_col, hit_col):
    """Print one line of metrics per class, measured against ``new_label``.

    The value printed as ``acc`` is hits / rows predicted as the class, and
    ``recall`` is hits / rows labelled as the class.

    Args:
        df: evaluation frame containing ``new_label``, ``pred_col``, ``hit_col``.
        classes: class ids to report.
        pred_col: column holding the predictions being scored.
        hit_col: 0/1 column marking rows where prediction equals ``new_label``.
    """
    for cls in classes:
        labelled = df[df['new_label'] == cls]
        n_labelled = len(labelled)
        n_hit = sum(labelled[hit_col])
        n_predicted = len(df[df[pred_col] == cls])
        prec = n_hit / n_predicted if n_predicted > 0 else 0
        rec = n_hit / n_labelled if n_labelled > 0 else 0
        f1 = 2 * rec * prec / (prec + rec) if (prec + rec) > 0 else 0
        print('类别:%d, acc:%.4f, recall:%.4f, f1: %.4f' % (cls, prec, rec, f1))


def test():
    """Evaluate the selected checkpoint on the test spreadsheet.

    Prints per-class metrics for the new model's predictions and then, under
    the heading '旧模型:', the same metrics for the spreadsheet's ``label``
    column scored against ``new_label``.
    """
    df_val = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
    df_val.fillna('', inplace=True)

    # NOTE(review): indentation was lost in the reviewed copy; the else is
    # attached to the outer test (fall back to 'label' when there is no
    # 'new_label' column).  Confirm against the original file.
    if 'new_label' in df_val.columns:
        if 'relabel' in df_val.columns:
            # A relabel that is a valid class id overrides the stored label.
            df_val['new_label'] = df_val.apply(
                lambda x: x['relabel'] if x['relabel'] in [0, 1, 2, 3, 4, 5] else x['new_label'],
                axis=1)
    else:
        df_val['new_label'] = df_val['label']

    print('测试公告数量:', len(df_val), set(df_val['new_label']))
    df_val['new_label'] = df_val['new_label'].apply(lambda x: int(x))
    df_val['front20'] = df_val['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
    df_val['behind20'] = df_val['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
    # Last six characters of the front context, reversed.
    df_val['front_reverse'] = df_val['front20'].apply(lambda x: x[-6:][::-1])
    # Positional index so the prediction arrays below line up with the rows.
    df_val.reset_index(drop=True, inplace=True)
    val_x, val_y = word2id(df_val, seq_len=seq_len, is_test=True)

    # Constructed but not used below.  Kept because the constructor's side
    # effects are not visible from this file -- TODO confirm it can be removed.
    role_old = Model_role_classify_word()

    with tf.Session():
        vocab, matrix = getVocabAndMatrix(getModel_word())
        model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab,
                               embedding_weights=matrix, classes=6)
        print("loading weights")
        # 2023-10-08: separate front/behind LSTM inputs, entity text removed;
        # this is the checkpoint finally selected.
        model.load_weights("log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5", by_name=True)
        logit = model.predict([val_x[0], val_x[1]])

    print('新模型预测结果', logit[:3])
    df_val['pred_new'] = np.argmax(logit, axis=-1)
    df_val['prob_new'] = np.amax(logit, axis=1)
    df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['new_label'] else 0, axis=1)
    _print_per_class_metrics(df_val, set(df_val['new_label']), 'pred_new', 'new=lb')

    print('旧模型:')
    df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['new_label'] else 0, axis=1)
    _print_per_class_metrics(df_val, set(df_val['label']), 'label', 'old=lb')

    # To keep the predictions, write df_val out here, e.g.
    # df_val.to_excel('traindata/所有训练测试数据_test_predict.xlsx', index=False)
    print('df_val.columns', df_val.columns)
- '''
- 类别:0, acc:0.4199, recall:0.6492, f1: 0.5099
- 类别:1, acc:0.5126, recall:0.7846, f1: 0.6201
- 类别:2, acc:0.4416, recall:0.6632, f1: 0.5301
- 类别:3, acc:0.7455, recall:0.7961, f1: 0.7700
- 类别:4, acc:0.7471, recall:0.8553, f1: 0.7975
- 类别:5, acc:0.9664, recall:0.9100, f1: 0.9373
- 类别:0, acc:0.9537, recall:0.9777, f1: 0.9655
- 类别:1, acc:0.9589, recall:0.9722, f1: 0.9655
- 类别:2, acc:0.9227, recall:0.9502, f1: 0.9363
- 类别:3, acc:0.8750, recall:0.9333, f1: 0.9032
- 类别:4, acc:0.9643, recall:1.0000, f1: 0.9818
- 类别:5, acc:0.9476, recall:0.8690, f1: 0.9066
- 类别:0, acc:0.9393, recall:0.9319, f1: 0.9356
- 类别:1, acc:0.9500, recall:0.9620, f1: 0.9560
- 类别:2, acc:0.9156, recall:0.9406, f1: 0.9279
- 类别:3, acc:0.8857, recall:0.9394, f1: 0.9118
- 类别:4, acc:0.9655, recall:0.9333, f1: 0.9492
- 类别:5, acc:0.9102, recall:0.8990, f1: 0.9046
- 类别:0, acc:0.9357, recall:0.9615, f1: 0.9484
- 类别:1, acc:0.9538, recall:0.9483, f1: 0.9510
- 类别:2, acc:0.9271, recall:0.9366, f1: 0.9318
- 类别:3, acc:0.9600, recall:0.9863, f1: 0.9730
- 类别:4, acc:0.9429, recall:0.9851, f1: 0.9635
- 类别:5, acc:0.9407, recall:0.9098, f1: 0.9250
- 类别:0, acc:0.9402, recall:0.9556, f1: 0.9478
- 类别:1, acc:0.9593, recall:0.9375, f1: 0.9483
- 类别:2, acc:0.9243, recall:0.9412, f1: 0.9327
- 类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
- 类别:4, acc:0.9452, recall:0.9857, f1: 0.9650
- 类别:5, acc:0.9296, recall:0.9058, f1: 0.9176
- 类别:0, acc:0.9468, recall:0.9568, f1: 0.9518
- 类别:1, acc:0.9489, recall:0.9489, f1: 0.9489
- 类别:2, acc:0.9388, recall:0.9312, f1: 0.9350
- 类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
- 类别:4, acc:0.9324, recall:0.9857, f1: 0.9583
- 类别:5, acc:0.9316, recall:0.9202, f1: 0.9258
- 类别:0, acc:0.9455, recall:0.9478, f1: 0.9467
- 类别:1, acc:0.9375, recall:0.9538, f1: 0.9456
- 类别:2, acc:0.9275, recall:0.9295, f1: 0.9285
- 类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
- 类别:4, acc:0.9583, recall:0.9857, f1: 0.9718
- 类别:5, acc:0.9262, recall:0.9159, f1: 0.9210
- 类别:0, acc:0.9331, recall:0.9516, f1: 0.9423
- 类别:1, acc:0.9524, recall:0.9467, f1: 0.9496
- 类别:2, acc:0.9437, recall:0.9089, f1: 0.9260
- 类别:3, acc:0.9565, recall:0.9565, f1: 0.9565
- 类别:4, acc:0.9242, recall:0.9683, f1: 0.9457
- 类别:5, acc:0.9270, recall:0.9261, f1: 0.9266
- 新模型:
- 类别:0, acc:0.9336, recall:0.9225, f1: 0.9280
- 类别:1, acc:0.9389, recall:0.9762, f1: 0.9572
- 类别:2, acc:0.8937, recall:0.9439, f1: 0.9181
- 类别:3, acc:0.9130, recall:1.0000, f1: 0.9545
- 类别:4, acc:0.9545, recall:0.8936, f1: 0.9231
- 类别:5, acc:0.9445, recall:0.9292, f1: 0.9368
- 旧模型:
- 类别:0, acc:0.8323, recall:0.7694, f1: 0.7996
- 类别:1, acc:0.9565, recall:0.8730, f1: 0.9129
- 类别:2, acc:0.8800, recall:0.8491, f1: 0.8643
- 类别:3, acc:0.8723, recall:0.9762, f1: 0.9213
- 类别:4, acc:0.9778, recall:0.9362, f1: 0.9565
- 类别:5, acc:0.8402, recall:0.8878, f1: 0.8633
- '''
def get_savedModel(weights_path="log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5",
                   export_dir="role_savedmodel2023-10-26"):
    """Export the classifier as a TensorFlow SavedModel.

    Builds the model in a fresh graph, loads the checkpoint by layer name and
    writes a SavedModel with inputs ``input0`` / ``input1`` and output
    ``outputs``.

    Args:
        weights_path: Keras weights file to load (default: the checkpoint
            selected on 2023-10-26 after the data was re-labelled).
        export_dir: directory the SavedModel is written to.
    """
    # The context manager closes the session when the export is done; it was
    # previously left open.
    with tf.Session(graph=tf.Graph()) as sess:
        with sess.graph.as_default():
            vocab, matrix = getVocabAndMatrix(getModel_word())
            model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab,
                                   embedding_weights=matrix, classes=6)
            # Initialise first, then overwrite with the trained weights.
            sess.run(tf.global_variables_initializer())
            model.load_weights(weights_path, by_name=True)
            tf.saved_model.simple_save(session=sess,
                                       export_dir=export_dir,
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1]},
                                       outputs={"outputs": model.output})
def predict_pb(data_path='traindata/df_test_20230912_predict.xlsx',
               export_dir="role_savedmodel2023-10-08"):
    """Run an exported SavedModel on a spreadsheet and print its accuracy.

    Prints the first three output rows, every (predicted, expected) pair that
    disagrees, and finally the hit count, miss count and accuracy.

    Args:
        data_path: spreadsheet readable by ``word2id`` (needs ``front20``,
            ``entity_text``, ``behind20`` and ``new_label`` columns).
        export_dir: SavedModel directory to load.
    """
    df_val = pd.read_excel(data_path)
    old_x, old_y = word2id(df_val, seq_len=seq_len)

    # The context manager closes the session after inference; it was
    # previously left open.
    with tf.Session() as sess_role:
        meta_graph_def = tf.saved_model.loader.load(sess=sess_role, tags=["serve"],
                                                    export_dir=export_dir)
        signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
        signature = meta_graph_def.signature_def[signature_key]
        graph = sess_role.graph
        input0 = graph.get_tensor_by_name(signature.inputs["input0"].name)
        input1 = graph.get_tensor_by_name(signature.inputs["input1"].name)
        output = graph.get_tensor_by_name(signature.outputs["outputs"].name)
        lg_old = sess_role.run(output, feed_dict={input0: old_x[0], input1: old_x[1]})

    print(lg_old[:3])
    pos = neg = 0
    for scores, target in zip(lg_old, old_y):
        predicted = np.argmax(scores)
        expected = np.argmax(target)  # targets are one-hot (word2id, is_test=False)
        if predicted != expected:
            print(predicted, expected)
            neg += 1
        else:
            pos += 1
    total = pos + neg
    print(pos, neg, pos / total if total else 0)
if __name__ == "__main__":
    # Script entry point.  Exactly one stage is enabled at a time; the other
    # stages are kept here, commented out, so they can be switched on by hand.
    # train()
    test()
    # get_savedModel()
    # predict_pb()
- # import tensorflow as tf
- #
- # # X = tf.constant([[[1, 1, 1], [2, 2, 2]],
- # # [[3, 3, 3], [4, 4, 4]],
- # # [[5, 5, 5], [6, 6, 6]]])
- # X = tf.constant([[1, 1, 1], [2, 2, 2]]
- # )
- # print(X.shape)
- # rs = tf.slice(X, [0, 0], [1, -1])
- # with tf.Session() as sess:
- # print(sess.run(rs))
|