#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/1/13 0013 14:03
from BiddingKG.dl.product.product_model import Product_Model
from BiddingKG.dl.product.data_util import BatchManager, get_label_data, id_to_tag, input_from_line, decode, result_to_json, df2data, dfsearchlb
from BiddingKG.dl.product.data_process import data_precess
import numpy as np
import pandas as pd
import tensorflow as tf
import json
import random
import pickle
import os
import glob
os.environ['CUDA_VISIBLE_DEVICES'] = "-1"  # run on CPU only


def train():
    # all_data = get_label_data()
    # random.shuffle(all_data)
    # train_data = all_data[:int(len(all_data)*0.85)]
    # dev_data = all_data[int(len(all_data)*0.85):]

    # df = pd.read_excel('data/所有产品标注数据筛选20211125.xlsx')
    # df.reset_index(drop=True, inplace=True)
    # np.random.seed(8)
    # shuffle_ids = np.random.permutation(len(df))
    # split_ids = int(len(df)*0.1)
    # train_ids = shuffle_ids[split_ids:]
    # dev_ids = shuffle_ids[:int(split_ids/2)]
    # df_train = df.iloc[train_ids]
    # df_dev = df.iloc[dev_ids]
    # train_data = df2data(df_train)
    # dev_data = df2data(df_dev)

    # with open(os.path.dirname(__file__)+'/data/train_data2021-11-30.pkl', 'rb') as f:
    #     train_data = pickle.load(f)
    # with open(os.path.dirname(__file__)+'/data/dev_data2021-11-30.pkl', 'rb') as f:
    #     dev_data = pickle.load(f)
    train_data, dev_data = data_precess()
    train_manager = BatchManager(train_data, batch_size=256)
    dev_manager = BatchManager(dev_data, batch_size=256)
    # tf_config = tf.ConfigProto()
    # tf_config.gpu_options.allow_growth = True
    tf_config = tf.ConfigProto(device_count={'GPU': 1})  # no effect while CUDA_VISIBLE_DEVICES="-1" above
    steps_per_epoch = train_manager.len_data
    ckpt_path = os.path.dirname(__file__) + '/' + "model"
    with tf.Session(config=tf_config) as sess:
        model = Product_Model()
        sess.run(tf.global_variables_initializer())
        model.saver.restore(sess, os.path.join(ckpt_path, "ner2.ckpt"))
        # ckpt = tf.train.get_checkpoint_state(ckpt_path)
        # if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        #     model.saver.restore(sess, ckpt.model_checkpoint_path)
        #     print("loaded previous model weights from", ckpt.model_checkpoint_path)
        print('preparing training data')
        loss = []
        mix_loss = 1000  # lowest dev loss seen so far
        max_f1 = 0
        for i in range(20):
            print('epochs:', i)
            # model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
            # break
            for batch in train_manager.iter_batch(shuffle=True):
                # print('batch:', len(batch))
                # step, batch_loss = model.run_step(sess, True, batch)
                step, batch_loss = model.run_step(sess, 'train', batch)
                loss.append(batch_loss)
                if step % 1000 == 0:
                    iteration = step // steps_per_epoch + 1
                    print('iter:{} step:{} loss:{}'.format(iteration, step, np.mean(loss)))
            if i >= 2 or i % 5 == 0:
                f1, precision, recall, evl_loss = model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
                print('f1:%.4f, precision:%.4f, recall:%.4f, evl_loss:%.4f' % (f1, precision, recall, evl_loss))
                # if max_f1 < f1:
                #     model.saver.save(sess, os.path.join(ckpt_path, "ner2.ckpt"))
                #     print("model save .bast f1 is %.4f" % f1)
                #     max_f1 = f1
                # checkpoint whenever dev loss improves; the save body mirrors the
                # commented-out f1-based variant above
                if evl_loss < mix_loss:
                    mix_loss = evl_loss
                    model.saver.save(sess, os.path.join(ckpt_path, "ner2.ckpt"))
                    print("model saved. best evl_loss is %.4f" % evl_loss)


def predict():
    # The first lines of predict() are assumptions: the input file is inferred from
    # the '_pred' output path written at the end of this function (the same file is
    # also read in the commented-out block under __main__), and the text column name
    # 'text' is a guess ('process_text' appears below as a commented alternative).
    df = pd.read_excel('data/产品数据自己人标注的原始数据.xlsx')
    rs = []
    ckpt_path = os.path.dirname(__file__) + '/' + "model"
    with tf.Session() as sess:
        model = Product_Model()
        sess.run(tf.global_variables_initializer())
        model.saver.restore(sess, os.path.join(ckpt_path, "ner2.ckpt"))
        for i in df.index:
            line = df.loc[i, 'text']  # column name assumed
            # if len(line) > 200:
            #     rs.append('')
            #     continue
            # line = df.loc[i, 'process_text']
            result = model.evaluate_line(sess, line)
            print(result[0][1])
            rs.append(json.dumps(result[0][1], ensure_ascii=False))
    # df['pred_new1202'] = pd.Series(rs)
    # df['reson_model2'] = pd.Series(rs)
    df['product_pred'] = pd.Series(rs)
    # df.to_excel('../test/data/贵州数据新字段提取信息_predict.xlsx')
    # df.to_excel('../test/data/所有产品标注数据_补充筛选废标原因数据_predict.xlsx')
    # df.to_excel('../test/data/所有产品标注数据筛选_废标_predict.xlsx')
    # df.to_excel('../test/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
    df.to_excel('data/产品数据自己人标注的原始数据_pred.xlsx')
    # df.to_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
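
# evaluate_line() is referenced (commented out) in __main__ below but is not defined
# in this file. The sketch here is an assumed minimal interactive driver, built only
# from the model.evaluate_line(sess, line) call used in predict() above; the restored
# checkpoint name "ner2.ckpt" matches the one used in train().
def evaluate_line():
    ckpt_path = os.path.dirname(__file__) + '/' + "model"
    with tf.Session() as sess:
        model = Product_Model()
        sess.run(tf.global_variables_initializer())
        model.saver.restore(sess, os.path.join(ckpt_path, "ner2.ckpt"))
        while True:
            line = input("text> ")  # one document per line; empty input exits
            if not line.strip():
                break
            result = model.evaluate_line(sess, line)
            print(result[0][1])  # predicted product entities for the input line
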
if __name__ == "__main__":
    # train()
    # evaluate_line()
    # save_model_pb()
    predict()
    # predict_df()

    # import json
    # df = pd.read_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
    # old_new = []
    # new_old = []
    # df['old-new'] = df.apply(lambda x: set([str(it) for it in json.loads(x['pred_old'])]) - set([str(it) for it in json.loads(x['pred_new'])]), axis=1)
    # df['new-old'] = df.apply(lambda x: set([str(it) for it in json.loads(x['pred_new'])]) - set([str(it) for it in json.loads(x['pred_old'])]), axis=1)
    # df['old=new'] = df.apply(lambda x: 1 if x['old-new'] == x['new-old'] else 0, axis=1)
    # df.to_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')

    # with open('data/dev_data2.pkl', 'rb') as f:
    #     dev_data = pickle.load(f)
    # import json
    # df_dev = pd.read_excel('data/产品数据自己人标注的原始数据.xlsx')[:]
    # def rows2lb(rows):
    #     rows = json.loads(rows)
    #     rows = list(set([it[0].split()[-1] for it in rows]))
    #     return json.dumps(rows, ensure_ascii=False)
    # df_dev['lbset'] = df_dev['rows'].apply(lambda x: rows2lb(x))
    # dev_data = dfsearchlb(df_dev)
    # dev_manager = BatchManager(dev_data, batch_size=64)
    # # ckpt_path = "model/ner0305.ckpt"  # f1:0.7304, precision:0.8092, recall:0.6656, evl_loss:2.2160
    # # ckpt_path = "model/ner0316.ckpt"  # f1:0.7220, precision:0.7854, recall:0.6681, evl_loss:2.2921
    # # ckpt_path = "model/ner2.ckpt"     # f1:0.8019, precision:0.8541, recall:0.7557, evl_loss:1.6286
    # # ckpt_path = "model/ner1029.ckpt"  # f1:0.6374, precision:0.6897, recall:0.5924, evl_loss:2.0840
    # ckpt_path = "model/ner1129.ckpt"    # f1:0.6034, precision:0.6931, recall:0.5343, evl_loss:1.9704
    # with tf.Session() as sess:
    #     model = Product_Model()
    #     sess.run(tf.global_variables_initializer())
    #     model.saver.restore(sess, ckpt_path)
    #     print("loaded previous model weights from", ckpt_path)
    #     f1, precision, recall, evl_loss = model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
    #     print('f1:%.4f, precision:%.4f, recall:%.4f, evl_loss:%.4f' % (f1, precision, recall, evl_loss))
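
# A runnable sketch of the commented-out old/new prediction comparison inside
# __main__ above. It assumes, as that code does, that the Excel file holds
# JSON-encoded lists in 'pred_old' and 'pred_new' columns; the function name
# compare_predictions is hypothetical.
def compare_predictions(path='data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx'):
    df = pd.read_excel(path)
    to_set = lambda s: set(str(it) for it in json.loads(s))
    df['old-new'] = df.apply(lambda x: to_set(x['pred_old']) - to_set(x['pred_new']), axis=1)
    df['new-old'] = df.apply(lambda x: to_set(x['pred_new']) - to_set(x['pred_old']), axis=1)
    # the two difference sets are disjoint, so they are equal only when both are
    # empty, i.e. when old and new predictions agree exactly
    df['old=new'] = df.apply(lambda x: 1 if x['old-new'] == x['new-old'] else 0, axis=1)
    df.to_excel(path)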