@@ -3,38 +3,54 @@
 # @Author : bidikeji
 # @Time : 2021/1/13 0013 14:03
 from BiddingKG.dl.product.product_model import Product_Model
-from BiddingKG.dl.product.data_util import BatchManager, get_label_data, id_to_tag, input_from_line, decode, result_to_json
+from BiddingKG.dl.product.data_util import BatchManager, get_label_data, id_to_tag, input_from_line, decode, result_to_json, df2data, dfsearchlb
+from BiddingKG.dl.product.data_process import data_precess
 import numpy as np
+import pandas as pd
 import tensorflow as tf
 import random
 import pickle
 import os
+import glob
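+# run on CPU only: "-1" hides every CUDA device from TensorFlow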
+os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
 
 def train():
     # all_data = get_label_data()
     # random.shuffle(all_data)
     # train_data = all_data[:int(len(all_data)*0.85)]
     # dev_data = all_data[int(len(all_data)*0.85):]
-    # with open('data/train_data2.pkl', 'wb') as f:
-    #     pickle.dump(train_data, f)
-    # with open('data/dev_data2.pkl', 'wb') as f:
-    #     pickle.dump(dev_data, f)
 
-    with open('data/train_data2.pkl', 'rb') as f:
-        train_data = pickle.load(f)
-    with open('data/dev_data2.pkl', 'rb') as f:
-        dev_data = pickle.load(f)
+    # df = pd.read_excel('data/所有产品标注数据筛选20211125.xlsx')
+    # df.reset_index(drop=True, inplace=True)
+    # np.random.seed(8)
+    # shuffle_ids = np.random.permutation(len(df))
+    # split_ids = int(len(df)*0.1)
+    # train_ids = shuffle_ids[split_ids:]
+    # dev_ids = shuffle_ids[:int(split_ids/2)]
+    # df_train = df.iloc[train_ids]
+    # df_dev = df.iloc[dev_ids]
+    # train_data = df2data(df_train)
+    # dev_data = df2data(df_dev)
 
-    train_manager = BatchManager(train_data, batch_size=128)
-    dev_manager = BatchManager(dev_data, batch_size=64)
+    # with open(os.path.dirname(__file__)+'/data/train_data2021-11-30.pkl', 'rb') as f:
+    #     train_data = pickle.load(f)
+    # with open(os.path.dirname(__file__)+'/data/dev_data2021-11-30.pkl', 'rb') as f:
+    #     dev_data = pickle.load(f)
 
-    tf_config = tf.ConfigProto()
-    tf_config.gpu_options.allow_growth = True
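+    # data_precess() [sic] builds and returns the train/dev split in code, replacing the pickled datasets above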
+    train_data, dev_data = data_precess()
+
+    train_manager = BatchManager(train_data, batch_size=256)
+    dev_manager = BatchManager(dev_data, batch_size=256)
+
+    # tf_config = tf.ConfigProto()
+    # tf_config.gpu_options.allow_growth = True
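+    # cap the session at one GPU; with CUDA_VISIBLE_DEVICES="-1" above it runs on CPU regardless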
+    tf_config = tf.ConfigProto(device_count={'GPU': 1})
     steps_per_epoch = train_manager.len_data
-    ckpt_path = "model"
+    ckpt_path = os.path.dirname(__file__)+'/'+"model"
     with tf.Session(config=tf_config) as sess:
         model = Product_Model()
         sess.run(tf.global_variables_initializer())
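+        # warm start: restore the existing ner2.ckpt weights instead of training from scratch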
+        model.saver.restore(sess, os.path.join(ckpt_path, "ner2.ckpt"))
         # ckpt = tf.train.get_checkpoint_state(ckpt_path)
         # if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
         #     model.saver.restore(sess, ckpt.model_checkpoint_path)
@@ -44,7 +60,7 @@ def train():
         loss = []
         mix_loss = 1000
         max_f1 = 0
-        for i in range(100):
+        for i in range(20):
             print('epochs:',i)
             # model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
             # break
@@ -53,20 +69,21 @@ def train():
                 # step, batch_loss = model.run_step(sess, True, batch)
                 step, batch_loss = model.run_step(sess, 'train', batch)
                 loss.append(batch_loss)
-                if step % 10 == 0:
+                if step % 1000 == 0:
                     iteration = step // steps_per_epoch + 1
                     print('iter:{} step:{} loss:{}'.format(iteration, step, np.mean(loss)))
-            if i >= 50 or i%5==0:
+            if i >= 2 or i % 5 == 0:
                 f1, precision, recall, evl_loss = model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
                 print('f1:%.4f, precision:%.4f, recall:%.4f, evl_loss:%.4f' % (f1, precision, recall, evl_loss))
-                if max_f1 < f1:
-                    model.saver.save(sess, os.path.join(ckpt_path, "ner2.ckpt"))
-                    print("model save .bast f1 is %.4f" % f1)
+                # if max_f1 < f1:
+                #     model.saver.save(sess, os.path.join(ckpt_path, "ner2.ckpt"))
+                #     print("model save .bast f1 is %.4f" % f1)
+                # max_f1 = f1
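+                # save a checkpoint only when dev loss and dev F1 both improve on the best seen so far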
+                if evl_loss < mix_loss and max_f1 < f1:
+                    mix_loss = evl_loss
                     max_f1 = f1
-            # if np.mean(loss)<mix_loss:
-            #     mix_loss = np.mean(loss)
-            #     model.saver.save(sess, os.path.join(ckpt_path, "ner.ckpt"))
-            #     print("model saved, loss is:",mix_loss)
+                    model.saver.save(sess, os.path.join(ckpt_path, "ner1202_find_lb.ckpt"))  # ner1130_find_lb.ckpt
+                    print("model saved, val_loss is:", mix_loss)
                 loss = []
 
 def evaluate_line():
@@ -74,15 +91,22 @@ def evaluate_line():
     with tf.Session() as sess:
         model = Product_Model()
         sess.run(tf.global_variables_initializer())
-        ckpt = tf.train.get_checkpoint_state(ckpt_path)
-        if ckpt and tf.train.checkpoint_exists(ckpt_path):
-            print('模型文件:',ckpt.model_checkpoint_path)
-            model.saver.restore(sess, ckpt.model_checkpoint_path)
-            print(model.logits, model.lengths, model.trans, model.dropout, model.char_inputs)
-            while True:
-                line = input("请输入测试句子:")
-                result = model.evaluate_line(sess, line)
-                print(result)
+        # model.saver.restore(sess, 'model/ner1215.ckpt')
+        # model.saver.restore(sess, 'model/ner_f10.7039_loss1.2353.ckpt')
+        model.saver.restore(sess, 'model/ner_epoch10_f10.6875_loss1.5230.ckpt')
+        while True:
+            line = input("请输入测试句子:")
+            result = model.evaluate_line(sess, line)
+            print(result)
+        # ckpt = tf.train.get_checkpoint_state(ckpt_path)
+        # if ckpt and tf.train.checkpoint_exists(ckpt_path):
+        #     print('模型文件:',ckpt.model_checkpoint_path)
+        #     model.saver.restore(sess, ckpt.model_checkpoint_path)
+        #     print(model.logits, model.lengths, model.trans, model.dropout, model.char_inputs)
+        # while True:
+        #     line = input("请输入测试句子:")
+        #     result = model.evaluate_line(sess, line)
+        #     print(result)
 def predict():
     pb_path = "model/product.pb"
     with tf.Graph().as_default():
@@ -111,7 +135,86 @@ def predict():
             result = result_to_json(line, tags)
             print(result)
 
+def predict_df():
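+    # batch inference: run the restored NER model over an annotated Excel sheet and write JSON predictions back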
+    ckpt_path = "model"
+    import json
+    with tf.Session() as sess:
+        model = Product_Model()
+        sess.run(tf.global_variables_initializer())
+        ckpt = tf.train.get_checkpoint_state(ckpt_path)
+        # model.saver.restore(sess, 'model/ner2.ckpt')
+        # model.saver.restore(sess, 'model/ner1201_find_lb.ckpt')  # f1:0.6972, precision:0.7403, recall:0.6588, evl_loss:1.2983 model saved, val_loss is: 1.32706
+        # model.saver.restore(sess, 'model/ner1208_find_lb.ckpt')  # f1:0.7038, precision:0.7634, recall:0.6528, evl_loss:1.3046 model saved, val_loss is: 1.29316
+        # model.saver.restore(sess, 'model/ner_f10.7039_loss1.2353.ckpt')  # f1:0.70 ner1215
+        model.saver.restore(sess, 'model/ner_epoch4_f10.6952_loss1.2512.ckpt')  # f1:0.70 ner1215
+
+        print(model.logits, model.lengths, model.trans, model.dropout, model.char_inputs)
+        # df = pd.read_excel('../test/data/贵州数据新字段提取信息_predict.xlsx')
+        # df = pd.read_excel('../test/data/所有产品标注数据_补充筛选废标原因数据.xlsx')
+        # df = pd.read_excel('../test/data/所有产品标注数据筛选_废标_predict.xlsx')
+        df = pd.read_excel('../test/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+        # df = pd.read_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
+        df.reset_index(drop=True, inplace=True)
+        rs = []
+        for i in df.index:
+            line = df.loc[i, 'text']
+            pos = df.loc[i, 'pos']
+            reason = df.loc[i, 'reasons_label']
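+            # skip rows flagged pos == 0 and rows that already carry a labeled reason (reasons_label != '[]')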
+            if pos == 0 or reason != '[]':
+                rs.append('')
+                continue
+            # if i > 200:
+            #     rs.append('')
+            #     continue
+            # line = df.loc[i, 'process_text']
+            result = model.evaluate_line(sess, line)
+            print(result[0][1])
+            rs.append(json.dumps(result[0][1], ensure_ascii=False))
+        # df['pred_new1202'] = pd.Series(rs)
+        df['reson_model'] = pd.Series(rs)
+        # df.to_excel('../test/data/贵州数据新字段提取信息_predict.xlsx')
+        # df.to_excel('../test/data/所有产品标注数据_补充筛选废标原因数据_predict.xlsx')
+        # df.to_excel('../test/data/所有产品标注数据筛选_废标_predict.xlsx')
+        df.to_excel('../test/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+        # df.to_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
+
 if __name__ == "__main__":
     # train()
-    # evaluate_line()
-    predict()
+    evaluate_line()
+    # predict()
+    # predict_df()
+    # import json
+    # df = pd.read_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
+    # old_new = []
+    # new_old = []
+    # df['old-new'] = df.apply(lambda x:set([str(it) for it in json.loads(x['pred_old'])])-set([str(it) for it in json.loads(x['pred_new'])]), axis=1)
+    # df['new-old'] = df.apply(lambda x:set([str(it) for it in json.loads(x['pred_new'])])-set([str(it) for it in json.loads(x['pred_old'])]), axis=1)
+    # df['old=new'] = df.apply(lambda x: 1 if x['old-new']==x['new-old'] else 0, axis=1)
+    # df.to_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
+
+
+    # with open('data/dev_data2.pkl', 'rb') as f:
+    #     dev_data = pickle.load(f)
+    # import json
+    # df_dev = pd.read_excel('data/产品数据自己人标注的原始数据.xlsx')[:]
+    # def rows2lb(rows):
+    #     rows = json.loads(rows)
+    #     rows = list(set([it[0].split()[-1] for it in rows]))
+    #     return json.dumps(rows, ensure_ascii=False)
+    # df_dev['lbset'] = df_dev['rows'].apply(lambda x:rows2lb(x))
+    # dev_data = dfsearchlb(df_dev)
+    # dev_manager = BatchManager(dev_data, batch_size=64)
+    # # ckpt_path = "model/ner0305.ckpt"  # f1:0.7304, precision:0.8092, recall:0.6656, evl_loss:2.2160
+    # # ckpt_path = "model/ner0316.ckpt"  # f1:0.7220, precision:0.7854, recall:0.6681, evl_loss:2.2921
+    # # ckpt_path = "model/ner2.ckpt"  # f1:0.8019, precision:0.8541, recall:0.7557, evl_loss:1.6286
+    # # ckpt_path = "model/ner1029.ckpt"  # f1:0.6374, precision:0.6897, recall:0.5924, evl_loss:2.0840
+    # # ckpt_path = "model/ner1129.ckpt"  # f1:0.6034, precision:0.6931, recall:0.5343, evl_loss:1.9704
+    # ckpt_path = "model/ner1129.ckpt"  # f1:0.6034, precision:0.6931, recall:0.5343, evl_loss:1.9704
+    # with tf.Session() as sess:
+    #     model = Product_Model()
+    #     sess.run(tf.global_variables_initializer())
+    #     model.saver.restore(sess, ckpt_path)
+    #     print("从文件加载原来模型数据",ckpt_path)
+    #     f1, precision, recall, evl_loss = model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
+    #     print('f1:%.4f, precision:%.4f, recall:%.4f, evl_loss:%.4f' % (f1, precision, recall, evl_loss))
+