#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/7/27 0027 15:05
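"""money_keras.py

Keras (TF1-style session) code for a 3-way money-type classifier used in BiddingKG:
each sample is a (left, center, right) text window around a money mention and is
classified as 招标金额 (tender amount), 中标金额 (winning-bid amount) or 其他金额 (other).
encodeInput, getVocabAndMatrix, getModel_word and the precision/recall/f1_score
metrics are presumably provided by the wildcard import from BiddingKG.dl.common.Utils.
"""
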
import os
import sys
import h5py
import numpy as np  # used throughout; the Utils wildcard import below may also export np
from keras import models, layers, losses, optimizers
sys.path.append(os.path.abspath("../../.."))
import pandas as pd
import math
from keras.callbacks import ModelCheckpoint
from BiddingKG.dl.common.Utils import *
import tensorflow as tf
from keras.models import load_model

lb = ['招标金额', '中标金额', '其他金额']  # tender amount / winning-bid amount / other amount
id2lb = {k: v for k, v in enumerate(lb)}
lb2id = {v: k for k, v in id2lb.items()}
seq_len = 30

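
# labeling: one-hot encode a class index, e.g. labeling(1) -> array([0., 1., 0.]).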
def labeling(label, out_len=3):
    out = np.zeros((out_len))
    out[label] = 1
    return out

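
# getTrainData: load the two annotation spreadsheets, apply any 'relabel' corrections,
# map label text to ids and encode every (left, center, right) triple with encodeInput,
# splitting randomly into train/test by `percent`. Assuming encodeInput returns three
# fixed-length id sequences, the transpose to (3, n_samples, seq_len) makes x[0]/x[1]/x[2]
# the three model inputs.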
def getTrainData(percent=0.9):
    df = pd.read_excel('traindata/2兼职标注数据_test22.xlsx')
    df2 = pd.read_excel('traindata/原金额模型标注数据.xls')
    df = df.append(df2, ignore_index=True)
    df.dropna(subset=['left'], inplace=True)
    df.fillna('', inplace=True)
    if 'relabel' in df.columns:
        df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] != "" else x['label'], axis=1)
        print('更新标注完成')  # relabel corrections applied
    for i in df.index:
        if df.loc[i, 'label'] not in lb:
            print('标签错误:', df.loc[i, 'label'])  # invalid label
    df['label'] = df['label'].apply(lambda x: lb2id.get(x, 0))
    print('总样本:', len(df))  # total number of samples
    train_x = []
    train_y = []
    test_x = []
    test_y = []
    for before, text, after, label in zip(df["left"], df["center"], df["right"], df["label"]):
        before = str(before) if str(before) != "nan" else ""
        text = str(text)
        after = str(after) if str(after) != "nan" else ""
        x = encodeInput([before, text, after], word_len=seq_len, word_flag=True, userFool=False)
        y = labeling(label)
        if np.random.random() < percent:
            train_x.append(x)
            train_y.append(y)
        else:
            test_x.append(x)
            test_y.append(y)
    return np.transpose(np.array(train_x), (1, 0, 2)), np.array(train_y), \
           np.transpose(np.array(test_x), (1, 0, 2)), np.array(test_y)

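
# word2id: same encoding as getTrainData for a single DataFrame, without the
# random train/test split; returns (x, y) with x transposed to (3, n_samples, seq_len).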
def word2id(df):
    train_x = []
    train_y = []
    test_x = []
    test_y = []
    for before, text, after, label in zip(df["left"], df["center"], df["right"], df["label"]):
        before = str(before) if str(before) != "nan" else ""
        text = str(text)
        after = str(after) if str(after) != "nan" else ""
        x = encodeInput([before, text, after], word_len=seq_len, word_flag=True, userFool=False)
        y = labeling(label)
        train_x.append(x)
        train_y.append(y)
    return np.transpose(np.array(train_x), (1, 0, 2)), np.array(train_y)

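
# train: read the pre-split df_train/df_test spreadsheets, build the three-branch
# BiLSTM and fit for up to 600 epochs, checkpointing weights whenever val_loss improves
# (the checkpoint filename pattern relies on a val_f1_score metric being logged by Keras).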
def train():
    # pk_file = "traindata/all_data.pk"
    # if os.path.exists(pk_file):
    #     train_x, train_y, test_x, test_y = load(pk_file)
    # else:
    #     train_x, train_y, test_x, test_y = getTrainData()
    #     save([train_x, train_y, test_x, test_y], pk_file)
    df_train = pd.read_excel('traindata/df_train.xlsx')
    df_test = pd.read_excel('traindata/df_test.xlsx')
    train_x, train_y = word2id(df_train)
    test_x, test_y = word2id(df_test)
    with tf.Session() as sess:
        vocab, matrix = getVocabAndMatrix(getModel_word())
        model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=3)
        print("loading weights")
        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5", by_name=True, skip_mismatch=True)
        callback = ModelCheckpoint(
            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
        model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y, batch_size=128, epochs=600,
                  callbacks=[callback],
                  validation_data=([test_x[0], test_x[1], test_x[2]], test_y))

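
# test: encode a single (left, center, right) span, load one checkpoint and print the
# softmax probabilities over the three money classes.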
def test(_span=[':预算金额1000000元,中标金额', '1151元', ';']):
    input = encodeInput(_span, word_len=seq_len, word_flag=True, userFool=False)
    print(input)
    graph = tf.get_default_graph()
    with graph.as_default():
        sess = tf.Session(graph=graph)
        with sess.as_default():
            vocab, matrix = getVocabAndMatrix(getModel_word())
            model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix,
                                   classes=3)
            model.load_weights("log/ep007-loss0.079-val_loss0.099-f1_score0.966.h5", by_name=True, skip_mismatch=True)
            logit = model.predict([np.array([input[0]]), np.array([input[1]]), np.array([input[2]])])
            print(logit)
            return logit

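
# get_savedModel: rebuild the model in a fresh graph, load the chosen checkpoint and
# export it via tf.saved_model.simple_save with named inputs input0/input1/input2 and
# a single "outputs" tensor, ready for SavedModel-based serving.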
def get_savedModel():
    sess = tf.Session(graph=tf.Graph())
    with sess.as_default():
        with sess.graph.as_default():
            vocab, matrix = getVocabAndMatrix(getModel_word())
            model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=3)
            sess.run(tf.global_variables_initializer())
            # model.load_weights(filepath="log/ep009-loss0.057-val_loss0.076-f1_score0.978.h5")
            # model.load_weights(filepath="log/ep007-loss0.079-val_loss0.099-f1_score0.966.h5")  # best 30-char model after the 2021/7/27 tuning
            model.load_weights(filepath="../../dl_dev/money/log/ep029-loss0.081-val_loss0.094-f1_score0.971.h5")  # best 30-char model after the 2021/08/06 tuning
            tf.saved_model.simple_save(session=sess,
                                       # export_dir="money_savedmodel20210727_3",
                                       export_dir="money_savedmodel20210806",
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1],
                                               "input2": model.input[2]},
                                       outputs={"outputs": model.output})

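
# tensorboard_model: reload an exported SavedModel and write its graph to "log2"
# so it can be inspected in TensorBoard.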
def tensorboard_model():
    with tf.Session(graph=tf.Graph()).as_default() as sess:
        with sess.graph.as_default():
            tf.saved_model.loader.load(sess, tags=["serve"], export_dir="money_savedmodel1")
            tf.summary.FileWriter(graph=sess.graph, logdir="log2")

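
# getBiLSTMModel: three integer-id inputs share one embedding layer (the layer name
# "char_embeding" is kept as-is so existing checkpoints still load by name), each input
# runs through its own Bidirectional LSTM (32/8/16 units), and the concatenated outputs
# go through dropout into a softmax Dense layer. The precision/recall/f1_score metrics
# are presumably the ones exported by BiddingKG.dl.common.Utils.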
def getBiLSTMModel(input_shape, vocab, embedding_weights, classes, use_am=False):
    assert len(input_shape) == 3
    list_input = []
    for i in range(input_shape[0]):
        list_input.append(layers.Input(shape=(input_shape[1],), dtype=tf.int32, name="input%d" % (i)))
    print("list_input", list_input)
    list_embedding = []
    embedding_input = list_input
    embedding = layers.Embedding(len(vocab), input_shape[2],
                                 weights=[embedding_weights] if embedding_weights is not None else None,
                                 trainable=True, name="char_embeding")
    for i in range(len(embedding_input)):
        print(i)
        list_embedding.append(embedding(embedding_input[i]))
    print(list_embedding)
    list_w2v = list_embedding
    list_lstm = []
    list_lstm.append(layers.Bidirectional(layers.LSTM(32, dropout=0.5, recurrent_dropout=0.5))(list_w2v[0]))
    list_lstm.append(layers.Bidirectional(layers.LSTM(8, dropout=0.5, recurrent_dropout=0.5))(list_w2v[1]))
    list_lstm.append(layers.Bidirectional(layers.LSTM(16, dropout=0.5, recurrent_dropout=0.5))(list_w2v[2]))
    concat = layers.concatenate(list_lstm)
    dropout = layers.Dropout(0.5)(concat)
    out = layers.Dense(classes, activation="softmax")(dropout)
    model = models.Model(list_input, out)
    model.compile(optimizer=optimizers.Adam(lr=0.001), loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model

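
# verification: load the current best checkpoint, score the validation spreadsheet,
# compare the Keras predictions with the existing 'pred_tf' column (presumably the
# TensorFlow version of the model) and the labels, then re-predict the original
# annotation file row by row and save both result spreadsheets.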
def verification():
    graph = tf.get_default_graph()
    with graph.as_default():
        sess = tf.Session(graph=graph)
        with sess.as_default():
            vocab, matrix = getVocabAndMatrix(getModel_word())
            model = getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab, embedding_weights=matrix,
                                   classes=3)
            model.load_weights("log/ep029-loss0.081-val_loss0.094-f1_score0.971.h5", by_name=True, skip_mismatch=True)

            df_val = pd.read_excel('traindata/df_val_predict.xlsx')
            val_x, val_y = word2id(df_val)
            logit = model.predict([val_x[0], val_x[1], val_x[2]])
            lg = np.argmax(logit, axis=-1)
            df_val['pred_kera'] = pd.DataFrame(lg)
            df_val['prob_kera'] = pd.DataFrame(np.amax(logit, axis=1))
            df_val['tf=kera'] = df_val.apply(lambda x: 1 if x['pred_kera'] == x['pred_tf'] else 0, axis=1)
            df_val['tf=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['pred_tf'] else 0, axis=1)
            df_val['kera=lb'] = df_val.apply(lambda x: 1 if x['pred_kera'] == x['label'] else 0, axis=1)
            df_val.to_excel('traindata/df_val_predict2.xlsx')

            df = pd.read_excel('traindata/2兼职标注数据_test22.xlsx')
            df.fillna('', inplace=True)
            df.reset_index(drop=True, inplace=True)
            preds = []
            if 'relabel' in df.columns:
                df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] != "" else x['label'], axis=1)
                print('更新标注完成')  # relabel corrections applied
            for left, center, right, label in zip(df['left'], df['center'], df['right'], df['label']):
                _span = [left, center, right]
                input = encodeInput(_span, word_len=seq_len, word_flag=True, userFool=False)
                logit = model.predict([np.array([input[0]]), np.array([input[1]]), np.array([input[2]])])
                lg = np.argmax(logit, axis=-1)[0]
                prob = logit[0][lg]
                lg = id2lb.get(lg, '')
                preds.append(lg)
                # if lg != label:
                #     print(left, '###', center, '###', right)
                #     print('预测类别:%s, 预测:%.4f, 标签:%s' % (lg, prob, label))  # predicted class, probability, gold label
                #     print()
            df['pred'] = pd.DataFrame(preds)
            df.to_excel('traindata/2兼职标注数据_test22_predict.xlsx')

if __name__ == "__main__":
    # train()
    verification()
    # test(_span=['预算金额:50万,中标金额:', '100.600万', '元,'])
    # get_savedModel()
    # tensorboard_model()