context_model.py 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619
  1. #!/usr/bin/python3
  2. # -*- coding: utf-8 -*-
  3. # @Author : bidikeji
  4. # @Time : 2021/7/28 0028 11:32
  5. import os
  6. # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
  7. import sys
  8. sys.path.append(os.path.abspath("../../.."))
  9. import pandas as pd
  10. from BiddingKG.dl.interface.modelFactory import Model_role_classify_word
  11. from BiddingKG.dl.common.Utils import *
  12. import tensorflow as tf
  13. import tensorflow.keras.backend as K
  14. # from tensorflow.keras import layers, models,optimizers,losses,callbacks
  15. from keras import layers, models,optimizers,losses,callbacks
  16. # import keras.backend as K
  17. # from keras.models import Model
  18. from keras.engine.topology import Layer
  19. from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
  20. def recall(y_true, y_pred):
  21. '''
  22. 计算召回率
  23. @Argus:
  24. y_true: 正确的标签
  25. y_pred: 模型预测的标签
  26. @Return
  27. 召回率
  28. '''
  29. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  30. c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
  31. if c3 == 0:
  32. return 0
  33. recall = c1 / c3
  34. return recall
  35. def f1_score(y_true, y_pred):
  36. '''
  37. 计算F1
  38. @Argus:
  39. y_true: 正确的标签
  40. y_pred: 模型预测的标签
  41. @Return
  42. F1值
  43. '''
  44. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  45. c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
  46. c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
  47. precision = c1 / c2
  48. if c3 == 0:
  49. recall = 0
  50. else:
  51. recall = c1 / c3
  52. f1_score = 2 * (precision * recall) / (precision + recall)
  53. return f1_score
  54. def precision(y_true, y_pred):
  55. '''
  56. 计算精确率
  57. @Argus:
  58. y_true: 正确的标签
  59. y_pred: 模型预测的标签
  60. @Return
  61. 精确率
  62. '''
  63. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  64. c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
  65. precision = c1 / c2
  66. return precision
  67. seq_len = 30 # 20
  68. sp = 30
  69. lb2id = {'招标人':0,
  70. '代理人':1,
  71. '中标人':2,
  72. '第二候选人':3,
  73. '第三候选人':4,
  74. '其他角色':5}
  75. def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
  76. # assert len(input_shape)==3
  77. list_input = []
  78. for i in range(input_shape[0]):
  79. list_input.append(layers.Input(shape=(input_shape[1],),dtype=tf.int32,name="input%d"%(i)))
  80. list_embedding = []
  81. embedding_input = list_input
  82. embedding = layers.Embedding(len(vocab),input_shape[2],
  83. weights=[embedding_weights] if embedding_weights is not None else None,
  84. mask_zero=True,trainable=True,name="char_embeding")
  85. for i in range(len(embedding_input)):
  86. list_embedding.append(embedding(embedding_input[i]))
  87. list_w2v = list_embedding
  88. list_lstm = []
  89. # list_lstm.append(layers.Bidirectional(layers.GRU(60, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[0])) #dropout=0.5, recurrent_dropout=0.5
  90. # list_lstm.append(layers.Bidirectional(layers.GRU(60, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[1]))
  91. list_lstm.append(layers.Bidirectional(layers.LSTM(120, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[0])) #dropout=0.5, recurrent_dropout=0.5
  92. list_lstm.append(layers.Bidirectional(layers.LSTM(120, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[1]))
  93. concat = layers.concatenate(list_lstm, axis=1)
  94. out = layers.Dense(classes,activation="softmax")(concat)
  95. model = models.Model(list_input,out)
  96. model.compile(optimizer=optimizers.Adam(lr=0.001),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
  97. model.summary()
  98. return model
  99. def labeling(label, out_len=6):
  100. out = np.zeros((out_len))
  101. out[label] = 1
  102. return out
  103. def word2id(df, seq_len=seq_len, is_test=False):
  104. train_x = []
  105. train_y = []
  106. test_x = []
  107. test_y = []
  108. # print(set(df['label']))
  109. # print(set(lb2id))
  110. # if set(df['label']) == set(lb2id):
  111. # df['label'] = df['label'].apply(lambda x:lb2id[x])
  112. for before, text, after, label in zip(df["front20"], df["entity_text"], df["behind20"], df["new_label"]):
  113. before = before if isinstance(before, str) else ""
  114. text = text if isinstance(text, str) else ""
  115. after = after if isinstance(after, str) else ""
  116. b = before.find('。')
  117. if b!=-1: # 分句看不到前面句子
  118. before = before[b+1:]
  119. e = after.find('。')
  120. if e!=-1:
  121. after = after[:e+1]
  122. x = encodeInput([before, after], word_len=seq_len, word_flag=True, userFool=False)
  123. if is_test:
  124. y = label
  125. else:
  126. y = labeling(label)
  127. train_x.append(x)
  128. train_y.append(y)
  129. return np.transpose(np.array(train_x), (1, 0, 2)), np.array(train_y)
  130. def fix_digit_eng(text):
  131. '''
  132. 处理数字及英文编号等
  133. :param text:
  134. :return:
  135. '''
  136. text = re.sub('第[一二三1-3]([条项章]|中学|医院|附属)|第三方(服务机构)?', 'xxx', text)
  137. text = re.sub('第01(中标|成交)?候选人', '第一中标候选人', text)
  138. text = re.sub('标段[一二三1-3]', '标段d', text)
  139. text = re.sub('第[一二三1-3](标段?|[分子标]?包)', 'd标段', text)
  140. text = re.sub('[a-zA-Z][a-zA-Z0-9=&_—-]{3,}', 'abc', text)
  141. text = re.sub('[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd', text)
  142. text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)
  143. text = re.sub('\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)
  144. text = re.sub('序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', '', text)
  145. return text.replace('(', '(').replace(')', ')')
  146. def train():
  147. # df_train = pd.read_excel('traindata/df_train_20230908.xlsx')
  148. # df_test = pd.read_excel('traindata/df_test_20230908.xlsx')
  149. # df_train = pd.read_excel('traindata/df_train_20230912.xlsx')
  150. # df_test = pd.read_excel('traindata/df_test_20230912.xlsx')
  151. # df_train = pd.read_excel('traindata/df_train_20230912_predict.xlsx')
  152. # df_test = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
  153. # df_train = pd.read_excel('traindata/df_train_20230912_2.xlsx')
  154. # df_test = pd.read_excel('traindata/df_test_20230912_2.xlsx')
  155. # df1 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_补充训练数据_train.xlsx')
  156. # df2 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_补充训练数据_test.xlsx')
  157. # df3 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_类别5的数据_补充数据_train.xlsx')
  158. # df4 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_类别5的数据_补充数据_test.xlsx')
  159. # df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
  160. # df_train = df_train.append([df1,df3, df, df, df, df], ignore_index=True)
  161. # df_test = df_test.append([df2,df4, df], ignore_index=True)
  162. df_train = pd.read_excel('traindata/所有训练测试数据_train.xlsx')
  163. df_test = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
  164. df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
  165. df_train = df_train.append([df, df, df, df], ignore_index=True)
  166. df_test = df_test.append([df], ignore_index=True)
  167. df_train = df_train.sample(frac=1)
  168. df_test = df_test.sample(frac=1)
  169. df_train['front20'] = df_train['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
  170. df_train['behind20'] = df_train['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
  171. df_test['front20'] = df_test['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
  172. df_test['behind20'] = df_test['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
  173. # df_train['front20'] = df_train['front'].apply(lambda x: str(x)[-seq_len:])
  174. # df_train['behind20'] = df_train['behind'].apply(lambda x: str(x)[:seq_len])
  175. # df_test['front20'] = df_test['front'].apply(lambda x: str(x)[-seq_len:])
  176. # df_test['behind20'] = df_test['behind'].apply(lambda x: str(x)[:seq_len])
  177. df_train.fillna("", inplace=True)
  178. df_test.fillna("", inplace=True)
  179. if 'relabel' in df_train.columns:
  180. df_train['new_label'] = df_train.apply(lambda x: int(x['relabel']) if x['relabel'] !="" else int(x['new_label']), axis=1)
  181. if 'relabel' in df_test.columns:
  182. df_test['new_label'] = df_test.apply(lambda x: int(x['relabel']) if x['relabel'] !="" else int(x['new_label']), axis=1)
  183. print('df_train', set(df_train['new_label']), set(df_train['relabel']))
  184. print('df_test', set(df_test['new_label']), set(df_test['relabel']))
  185. df_train = df_train[df_train['new_label'].isin([0,1,2,3,4,5])]
  186. df_test = df_test[df_test['new_label'].isin([0,1,2,3,4,5])]
  187. print('训练数据:%d,测试数据:%d'%(len(df_train), len(df_test)))
  188. print(set(df_train['new_label']), set(lb2id.values()))
  189. assert set(df_train['new_label'])==set(lb2id.values())
  190. train_x, train_y = word2id(df_train)
  191. print('train_x.shape', train_x.shape)
  192. print('train_y.shape', train_y.shape)
  193. print('train_x: ', train_x[0])
  194. test_x, test_y = word2id(df_test)
  195. with tf.Session() as sess:
  196. vocab, matrix = getVocabAndMatrix(getModel_word())
  197. model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
  198. print("loading weights")
  199. # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5",by_name=True, skip_mismatch=True)
  200. # model.load_weights("log/ep008-loss0.103-val_loss0.109-f1_score0.970.h5",by_name=True) # 加 多一个dense
  201. # model.load_weights("log/ep021-loss0.078-val_loss0.104-f1_score0.969.h5",by_name=True) # 加 多一个lstm连接前后lstm输出
  202. callback = callbacks.ModelCheckpoint(
  203. filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
  204. monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
  205. model.fit(x=[train_x[0],train_x[1]], y=train_y, batch_size=512, epochs=100, callbacks=[callback],
  206. validation_data=[[test_x[0],test_x[1]], test_y])
  207. def test():
  208. # df_val = pd.read_excel('traindata/df_test_20230908.xlsx')
  209. # df_val = pd.read_excel('traindata/df_test_20230908_predict.xlsx')
  210. # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据.xlsx')
  211. # df_val = pd.read_excel('traindata/df_test_20230912_2.xlsx')
  212. # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
  213. # df_val = pd.read_excel('traindata/df_train_20230912_2.xlsx')
  214. # df_val = pd.read_excel('traindata/角色实体分类新旧数据汇总.xlsx')
  215. # df_val = pd.read_excel('E:/角色金额数据/数据库验证数据原模型识别结果20230926.xlsx')
  216. # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
  217. # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果.xlsx')
  218. # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果60000-90000.xlsx')
  219. # df_val = pd.read_excel('traindata/2023-08-24所有公告_重新预测结果40000-60000_重新不一致结果.xlsx')
  220. # df_val = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='不确定角色表达')
  221. # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_筛选前后文不同的数据.xlsx')
  222. # df_val = pd.read_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果.xlsx')
  223. # df_val = pd.read_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果_re.xlsx')
  224. # df_val = pd.read_excel('E:\实体识别数据/少于10条关键词补充数据.xlsx')
  225. # df_val = pd.read_excel('traindata/所有训练测试数据_add.xlsx')
  226. df_val = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
  227. # df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
  228. # df_val = df_val.append([df], ignore_index=True)
  229. # df_val = df_val[['entity_id', 'docid', 'label', 'front', 'entity_text', 'behind',
  230. # 'new_label', 'relabel', 'kws', 'new_old', 'front20', 'behind20',
  231. # 'front_reverse', 'pred_new', 'prob_new', 'new=lb']]
  232. # df_val = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据.xlsx')
  233. # df_val = pd.read_excel('traindata/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据_predict.xlsx')
  234. # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据_predict_重新标注数据20230919.xlsx')
  235. lb2id = {'招标人': 0, '代理人': 1, '中标人': 2, '第二候选人': 3, '第三候选人': 4, '其他角色': 5}
  236. # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据.xlsx')
  237. # df_val['label'] = df_val.apply(lambda x: x['relabel'] if x['relabel']!='' else x['label'], axis=1)
  238. # df_val['new_label'] = df_val['label'].apply(lambda x: lb2id[x])
  239. # df_val['label'] = df_val['label'].apply(lambda x: lb2id[x])
  240. # df_val['relabel'] = df_val['relabel'].apply(lambda x: lb2id.get(x, ''))
  241. # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据_predict.xlsx')
  242. # df_val.fillna('', inplace=True)
  243. # print('测试公告数量:', len(df_val), set(df_val['new_label']))
  244. # df_val['new_label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] != '' else x['new_label'], axis=1)
  245. # # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
  246. # df_val = pd.read_excel('traindata/df_test_20230912_2.xlsx')
  247. # print(df_val.columns)
  248. # df2 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_补充训练数据_test.xlsx')
  249. # df4 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_类别5的数据_补充数据_test.xlsx')
  250. # # df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
  251. # print(df2.columns)
  252. # df_val = df_val.append([df2, df4], ignore_index=True)
  253. # df_val = df_val[['entity_id', 'docid', 'label', 'front', 'entity_text', 'behind',
  254. # 'new_label', 'relabel', 'kws', 'new_old', 'front20', 'behind20',
  255. # 'front_reverse', 'pred_new', 'prob_new', 'new=lb']]
  256. df_val.fillna('', inplace=True)
  257. # df_val = df_val[df_val['relabel']!=6]
  258. # for i in df_val.index:
  259. # b = df_val.loc[i, 'front']
  260. # e = df_val.loc[i, 'behind']
  261. # if not isinstance(b, str):
  262. # print('异常数据', i, type(b))
  263. # if not isinstance(e, str):
  264. # print('异常数据', i, type(e))
  265. if 'new_label' in df_val.columns:
  266. if 'relabel' in df_val.columns:
  267. df_val['new_label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] in [0,1,2,3,4,5] else x['new_label'], axis=1)
  268. else:
  269. df_val['new_label'] = df_val['label']
  270. # df_val['new_label'] = df_val['new_label'].apply(lambda x: x if x in [0, 1, 2, 3, 4, 5] else 5)
  271. # df_val = df_val[df_val['new_label'].isin([0,1,2,3,4,5])]
  272. print('测试公告数量:', len(df_val), set(df_val['new_label']))
  273. df_val['new_label'] = df_val['new_label'].apply(lambda x: int(x))
  274. df_val['front20'] = df_val['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
  275. df_val['behind20'] = df_val['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
  276. # df_val.drop_duplicates(subset=['front20', 'behind20'], inplace=True)
  277. # print('测试公告去重后数量:', len(df_val))
  278. # df_val['front20'] = df_val['front'].apply(lambda x: str(x)[-seq_len:])
  279. # df_val['behind20'] = df_val['behind'].apply(lambda x: str(x)[:seq_len])
  280. df_val['front_reverse'] = df_val['front20'].apply(lambda x: x[-6:][::-1])
  281. # df_val['label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] !="" else x['label'], axis=1)
  282. # df_val['label'] = df_val['label'].apply(lambda x:lb2id[x] if x in lb2id else x)
  283. df_val.reset_index(drop=True, inplace=True)
  284. val_x, val_y = word2id(df_val, seq_len=seq_len, is_test=True)
  285. # val_x = np.transpose(np.array(train_x), (1, 0, 2))
  286. # old_x, old_y = word2id(df_val, seq_len=50)
  287. # old_x = np.transpose(np.array(old_x), (1, 0, 2))
  288. role_old = Model_role_classify_word()
  289. with tf.Session() as sess:
  290. vocab, matrix = getVocabAndMatrix(getModel_word())
  291. model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
  292. print("loading weights")
  293. # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5",by_name=True, skip_mismatch=True)
  294. # model.load_weights("log/ep006-loss0.174-val_loss0.234-f1_score0.917.h5",by_name=True, skip_mismatch=True)
  295. # model.load_weights("log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5",by_name=True, skip_mismatch=True)
  296. # model.load_weights("log/ep014-loss0.091-val_loss0.110-f1_score0.968.h5",by_name=True, skip_mismatch=True)
  297. # model.load_weights("log/ep008-loss0.162-val_loss0.173-f1_score0.947.h5",by_name=True) # 20230425 取消实体,合并前后输入 效果不佳,招标代理分不清,特别是 受。。。委托这种
  298. # model.load_weights("log/ep009-loss0.104-val_loss0.115-f1_score0.966.h5",by_name=True) # 20230425 取消实体,前后分别输入
  299. # model.load_weights("log/ep008-loss0.103-val_loss0.109-f1_score0.970.h5",by_name=True) # 20230425 取消实体,前后分别输入 多加一个danse
  300. # model.load_weights("log/ep019-loss0.087-val_loss0.106-f1_score0.968.h5",by_name=True) # 20230425 前后分别输入 中间用公司代替,三输入lstm后合并再次经过lstm
  301. # model.load_weights("log/ep004-loss0.069-val_loss0.103-f1_score0.971.h5",by_name=True) # 20230425 前后分别输入 去掉实体,2输入lstm后合并再次经过lstm
  302. # model.load_weights("log/20ep045-loss0.140-val_loss0.181-f1_score0.941.h5",by_name=True) # 20230908 前后分别输入 去掉实体,2输入lstm后合并输出
  303. # model.load_weights("log/20912ep038-loss0.123-val_loss0.181-f1_score0.947.h5",by_name=True) # 20230908 前后分别输入 去掉实体,2输入lstm后合并输出
  304. # model.load_weights("log/ep068-loss0.075-val_loss0.190-f1_score0.941.h5",by_name=True) # 20230908 前后分别输入gru 去掉实体
  305. # model.load_weights("log/gruep043-loss0.124-val_loss0.177-f1_score0.947.h5",by_name=True) # 20230908 前后分别输入gru 去掉实体
  306. # model.load_weights("log/ep052-loss0.130-val_loss0.216-f1_score0.931.h5",by_name=True) # 20230919 前后分别输入gru 去掉实体 新标注数据+旧数据重新标注
  307. model.load_weights("log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5",by_name=True) # 20231008 前后分别输入lstm 去掉实体 最终选择结果
  308. # lg_old = role_old.predict(old_x)
  309. # df_val['pred_old'] = pd.DataFrame(np.argmax(lg_old, axis=1))
  310. # df_val['prob_old'] = pd.DataFrame(np.amax(lg_old, axis=1))
  311. # logit = model.predict([val_x[0], val_x[1], val_x[2]])
  312. # print('新模型预测结果',logit[:3])
  313. # print('旧模型预测结果:',lg_old[:3])
  314. # df_val['pred_new'] = pd.DataFrame(np.argmax(logit, axis=-1))
  315. # df_val['prob_new'] = pd.DataFrame(np.amax(logit, axis=1))
  316. # # df_val['new=new3'] = df_val.apply(lambda x: 1 if x['pred_new3'] == x['pred_new2'] else 0, axis=1)
  317. # df_val['new=old'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
  318. # df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['pred_old'] else 0, axis=1)
  319. # df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
  320. # logit = model.predict([val_x])
  321. logit = model.predict([val_x[0],val_x[1]])
  322. print('新模型预测结果', logit[:3])
  323. # df_val['pred_new2'] = df_val['pred_new']
  324. df_val['pred_new'] = pd.DataFrame(np.argmax(logit, axis=-1))
  325. df_val['prob_new'] = pd.DataFrame(np.amax(logit, axis=1))
  326. # df_val['new=new2'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['pred_new2'] else 0, axis=1)
  327. df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['new_label'] else 0, axis=1)
  328. for it in set(df_val['new_label']):
  329. df_tmp = df_val[df_val['new_label']==it]
  330. lb = len(df_tmp)
  331. eq = sum(df_tmp['new=lb'])
  332. pr = len(df_val[df_val['pred_new']==it])
  333. acc = eq/pr if pr>0 else 0
  334. recall = eq/lb if lb>0 else 0
  335. f1 = 2*recall*acc/(acc+recall) if (acc+recall)>0 else 0
  336. print('类别:%d, acc:%.4f, recall:%.4f, f1: %.4f'%(it, acc, recall, f1))
  337. print('旧模型:')
  338. df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['new_label'] else 0, axis=1)
  339. for it in set(df_val['label']):
  340. df_tmp = df_val[df_val['new_label']==it]
  341. lb = len(df_tmp)
  342. eq = sum(df_tmp['old=lb'])
  343. pr = len(df_val[df_val['label']==it])
  344. acc = eq/pr if pr>0 else 0
  345. recall = eq/lb if lb>0 else 0
  346. f1 = 2*recall*acc/(acc+recall) if (acc+recall)>0 else 0
  347. print('类别:%d, acc:%.4f, recall:%.4f, f1: %.4f'%(it, acc, recall, f1))
  348. # df_val.to_excel('traindata/df_val_predict.xlsx')
  349. # df_val.to_excel('traindata/兼职标注数据_test29_predict.xlsx')
  350. # df_val.to_excel('traindata/兼职标注数据_test3_predict.xlsx')
  351. # df_val.to_excel('traindata/df_test_20230908_predict.xlsx', index=False)
  352. # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据_predict.xlsx', index=False)
  353. # df_val.to_excel('traindata/旧训练测试数据_筛选数据_predict_重新标注数据20230919.xlsx', index=False)
  354. # df_val.to_excel('traindata/旧训练测试数据_筛选数据_predict.xlsx', index=False)
  355. # df_val.to_excel('traindata/df_test_20230912_predict.xlsx', index=False)
  356. # df_val.to_excel('traindata/df_test_20230912_加补充数据_predict.xlsx', index=False)
  357. # df_val.to_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果.xlsx', index=False)
  358. # df_val.to_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果_re.xlsx', index=False)
  359. # df_val.to_excel('E:\实体识别数据/少于10条关键词补充数据.xlsx', index=False)
  360. # df_val.to_excel('traindata/所有训练测试数据_add_predict.xlsx', index=False)
  361. # df_val.to_excel('traindata/所有训练测试数据_test_predict.xlsx', index=False)
  362. # df_val.to_excel('traindata/df_train_20230912_predict.xlsx', index=False)
  363. # df_val = df_val[df_val['new=lb']==0]
  364. # for i in df_val.index:
  365. # if ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'front'])) or ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'behind'])):
  366. # print('过滤异常数据',i ,ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'front'])) or ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'behind'])))
  367. # df_val.drop(index=i, inplace=True)
  368. # print('不一致数量: ', len(df_val))
  369. # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果_重新不一致结果.xlsx', index=False)
  370. # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果40000-60000_重新不一致结果.xlsx', index=False)
  371. # df_val.to_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据.xlsx', index=False)
  372. # df_val.to_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_pred.xlsx', index=False)
  373. # df_val.to_excel('traindata/角色实体分类新旧数据汇总_predict.xlsx', index=False)
  374. # df_val.to_excel('E:/角色金额数据/数据库验证数据原模型识别结果20230926_predict.xlsx', index=False)
  375. # df_val.to_excel('E:\角色金额数据/易错角色表达_predict.xlsx', index=False)
  376. print('df_val.columns', df_val.columns)
  377. '''
  378. 类别:0, acc:0.4199, recall:0.6492, f1: 0.5099
  379. 类别:1, acc:0.5126, recall:0.7846, f1: 0.6201
  380. 类别:2, acc:0.4416, recall:0.6632, f1: 0.5301
  381. 类别:3, acc:0.7455, recall:0.7961, f1: 0.7700
  382. 类别:4, acc:0.7471, recall:0.8553, f1: 0.7975
  383. 类别:5, acc:0.9664, recall:0.9100, f1: 0.9373
  384. 类别:0, acc:0.9537, recall:0.9777, f1: 0.9655
  385. 类别:1, acc:0.9589, recall:0.9722, f1: 0.9655
  386. 类别:2, acc:0.9227, recall:0.9502, f1: 0.9363
  387. 类别:3, acc:0.8750, recall:0.9333, f1: 0.9032
  388. 类别:4, acc:0.9643, recall:1.0000, f1: 0.9818
  389. 类别:5, acc:0.9476, recall:0.8690, f1: 0.9066
  390. 类别:0, acc:0.9393, recall:0.9319, f1: 0.9356
  391. 类别:1, acc:0.9500, recall:0.9620, f1: 0.9560
  392. 类别:2, acc:0.9156, recall:0.9406, f1: 0.9279
  393. 类别:3, acc:0.8857, recall:0.9394, f1: 0.9118
  394. 类别:4, acc:0.9655, recall:0.9333, f1: 0.9492
  395. 类别:5, acc:0.9102, recall:0.8990, f1: 0.9046
  396. 类别:0, acc:0.9357, recall:0.9615, f1: 0.9484
  397. 类别:1, acc:0.9538, recall:0.9483, f1: 0.9510
  398. 类别:2, acc:0.9271, recall:0.9366, f1: 0.9318
  399. 类别:3, acc:0.9600, recall:0.9863, f1: 0.9730
  400. 类别:4, acc:0.9429, recall:0.9851, f1: 0.9635
  401. 类别:5, acc:0.9407, recall:0.9098, f1: 0.9250
  402. 类别:0, acc:0.9402, recall:0.9556, f1: 0.9478
  403. 类别:1, acc:0.9593, recall:0.9375, f1: 0.9483
  404. 类别:2, acc:0.9243, recall:0.9412, f1: 0.9327
  405. 类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
  406. 类别:4, acc:0.9452, recall:0.9857, f1: 0.9650
  407. 类别:5, acc:0.9296, recall:0.9058, f1: 0.9176
  408. 类别:0, acc:0.9468, recall:0.9568, f1: 0.9518
  409. 类别:1, acc:0.9489, recall:0.9489, f1: 0.9489
  410. 类别:2, acc:0.9388, recall:0.9312, f1: 0.9350
  411. 类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
  412. 类别:4, acc:0.9324, recall:0.9857, f1: 0.9583
  413. 类别:5, acc:0.9316, recall:0.9202, f1: 0.9258
  414. 类别:0, acc:0.9455, recall:0.9478, f1: 0.9467
  415. 类别:1, acc:0.9375, recall:0.9538, f1: 0.9456
  416. 类别:2, acc:0.9275, recall:0.9295, f1: 0.9285
  417. 类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
  418. 类别:4, acc:0.9583, recall:0.9857, f1: 0.9718
  419. 类别:5, acc:0.9262, recall:0.9159, f1: 0.9210
  420. 类别:0, acc:0.9331, recall:0.9516, f1: 0.9423
  421. 类别:1, acc:0.9524, recall:0.9467, f1: 0.9496
  422. 类别:2, acc:0.9437, recall:0.9089, f1: 0.9260
  423. 类别:3, acc:0.9565, recall:0.9565, f1: 0.9565
  424. 类别:4, acc:0.9242, recall:0.9683, f1: 0.9457
  425. 类别:5, acc:0.9270, recall:0.9261, f1: 0.9266
  426. 新模型:
  427. 类别:0, acc:0.9336, recall:0.9225, f1: 0.9280
  428. 类别:1, acc:0.9389, recall:0.9762, f1: 0.9572
  429. 类别:2, acc:0.8937, recall:0.9439, f1: 0.9181
  430. 类别:3, acc:0.9130, recall:1.0000, f1: 0.9545
  431. 类别:4, acc:0.9545, recall:0.8936, f1: 0.9231
  432. 类别:5, acc:0.9445, recall:0.9292, f1: 0.9368
  433. 旧模型:
  434. 类别:0, acc:0.8323, recall:0.7694, f1: 0.7996
  435. 类别:1, acc:0.9565, recall:0.8730, f1: 0.9129
  436. 类别:2, acc:0.8800, recall:0.8491, f1: 0.8643
  437. 类别:3, acc:0.8723, recall:0.9762, f1: 0.9213
  438. 类别:4, acc:0.9778, recall:0.9362, f1: 0.9565
  439. 类别:5, acc:0.8402, recall:0.8878, f1: 0.8633
  440. '''
  441. def get_savedModel():
  442. sess = tf.Session(graph=tf.Graph())
  443. with sess.as_default():
  444. with sess.graph.as_default():
  445. vocab, matrix = getVocabAndMatrix(getModel_word())
  446. model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
  447. sess.run(tf.global_variables_initializer())
  448. # model.load_weights(filepath="log/ep009-loss0.057-val_loss0.076-f1_score0.978.h5")
  449. # model.load_weights(filepath="log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5") #7月30日训练最优模型20字
  450. # model.load_weights(filepath="../../dl_dev/role/log/ep015-loss0.090-val_loss0.113-f1_score0.967.h5") #8月5日调整部分招标人标注后重新训练结果20字
  451. # model.load_weights("log/ep004-loss0.069-val_loss0.103-f1_score0.971.h5", # 20230427
  452. # model.load_weights("log/ep059-loss0.096-val_loss0.180-f1_score0.945.h5", # 20231008 重新整理标注数据后结果
  453. # model.load_weights("log/ep059-loss0.101-val_loss0.191-f1_score0.940.h5", # 20231012 重新整理标注数据后结果
  454. # model.load_weights("log/ep052-loss0.123-val_loss0.194-f1_score0.937.h5", # 20231012 重新整理标注数据后结果
  455. model.load_weights("log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5", # 20231026 重新整理标注数据后结果
  456. by_name=True) # 20230425 前后分别输入 去掉实体,2输入lstm后合并再次经过lstm 2023/04/27
  457. tf.saved_model.simple_save(session=sess,
  458. export_dir="role_savedmodel2023-10-26", # role_savedmodel2021-8-5
  459. inputs={"input0": model.input[0],
  460. "input1": model.input[1],
  461. }, #"input2": model.input[2]
  462. outputs={"outputs": model.output})
  463. def predict_pb():
  464. # df_val = pd.read_excel('traindata/df_val.xlsx')
  465. df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
  466. old_x, old_y = word2id(df_val, seq_len=seq_len)
  467. # old_x = np.transpose(np.array(old_x), (1, 0, 2))
  468. sess_role = tf.Session()
  469. with sess_role.as_default() as sess:
  470. with sess_role.graph.as_default():
  471. meta_graph_def = tf.saved_model.loader.load(sess=sess_role, tags=["serve"],
  472. export_dir="role_savedmodel2023-10-08") # role_savedmodel2021-8-5 role_savedmodel2023-04-27
  473. signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
  474. signature_def = meta_graph_def.signature_def
  475. input0 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)
  476. input1 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)
  477. # input2 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
  478. output = sess_role.graph.get_tensor_by_name(
  479. signature_def[signature_key].outputs["outputs"].name)
  480. model_role = [[input0, input1], output] #, input2
  481. lg_old = sess_role.run(output, feed_dict={input0:old_x[0],
  482. input1:old_x[1],
  483. }) # input2:old_x[2]
  484. print(lg_old[:3])
  485. pos = neg = 0
  486. for i in range(len(lg_old)):
  487. if np.argmax(lg_old[i]) != np.argmax(old_y[i]):
  488. print(np.argmax(lg_old[i]) , np.argmax(old_y[i]))
  489. neg += 1
  490. else:
  491. pos += 1
  492. print(pos, neg, pos/(pos+neg))
  493. if __name__ == "__main__":
  494. # train()
  495. test()
  496. # get_savedModel()
  497. # predict_pb()
  498. # import tensorflow as tf
  499. #
  500. # # X = tf.constant([[[1, 1, 1], [2, 2, 2]],
  501. # # [[3, 3, 3], [4, 4, 4]],
  502. # # [[5, 5, 5], [6, 6, 6]]])
  503. # X = tf.constant([[1, 1, 1], [2, 2, 2]]
  504. # )
  505. # print(X.shape)
  506. # rs = tf.slice(X, [0, 0], [1, -1])
  507. # with tf.Session() as sess:
  508. # print(sess.run(rs))