123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- '''
- Created on 2019年4月15日
- @author: User
- '''
- import os
- import sys
- sys.path.append(os.path.abspath("../.."))
- import pandas as pd
- import gensim
- import numpy as np
- import math
- import models
- from keras.callbacks import ModelCheckpoint
- from BiddingKG.dl.common.Utils import *
def embedding(datas,shape):
    '''
    @summary: Look up the embedding vector of every character of each input text
    @param:
        datas: list of texts; each item is converted with str() and at most its
               trailing shape[1] characters are used
        shape: shape of the result; shape[0] indexes the texts and shape[1] the
               character positions inside a text
    @return: array of the requested shape; positions whose character is not in the
             model vocabulary, or that lie beyond the end of the text, stay zero
    '''
    # Model comes from the project helper; it is used via `.vocab` and item lookup.
    w2v = getModel_word()
    result = np.zeros(shape)
    width = shape[1]
    for row, data in enumerate(datas):
        tail = str(data)[-shape[1]:]
        # Pairing with range(width) caps the number of filled positions at width.
        for col, char in zip(range(width), tail):
            if char in w2v.vocab:
                result[row][col] = w2v[char]
            # Out-of-vocabulary characters keep their zero vector.
    return result
def labeling(label,out_len=2):
    '''
    @summary: Build a one-hot vector for a class index
    @param:
        label: index of the position that is set to 1
        out_len: length of the returned vector
    @return: 1-D float array of length out_len, all zeros except out[label] == 1
    '''
    one_hot = np.zeros(out_len)
    one_hot[label] = 1
    return one_hot
-
def _stackSamples(samples, sample_shape):
    '''
    @summary: Stack per-sample arrays and move the input-channel axis to the front
    @param:
        samples: list of arrays, each of shape sample_shape
        sample_shape: (channels, length, dim) shape of a single sample
    @return: array of shape (channels, n_samples, length, dim); an empty list gives
             an empty array of shape (channels, 0, length, dim) instead of raising
    '''
    if not samples:
        # np.array([]) is 1-D, so transposing it with four axes would raise.
        return np.zeros((sample_shape[0], 0) + tuple(sample_shape[1:]))
    return np.transpose(np.array(samples), (1, 0, 2, 3))


def getTrainData(percent=0.9):
    '''
    @summary: Read the labelled Excel sheets, embed every row and randomly split
              the rows into a training set and a test set
    @param:
        percent: probability that a row is assigned to the training set
    @return: (train_x, train_y, test_x, test_y); the x arrays have shape
             (3, n, 40, 60) - one leading slice each for the before / code / after
             texts - and the y arrays have shape (n, 2) holding one-hot labels
    '''
    sample_shape = (3, 40, 60)
    label_len = 2

    train_x = []
    train_y = []
    test_x = []
    test_y = []

    files = ["批量.xls","剩余手工标注.xls"]

    for file in files:
        df = pd.read_excel(file)

        for before,text,after,label in zip(df["list_before"],df["list_code"],df["list_after"],df["list_label"]):
            # A missing label (NaN) is treated as class 0.
            the_label = 0
            if not math.isnan(label):
                the_label = int(label)
            if the_label not in [0,1]:
                # Report and skip rows whose label is outside the two known classes.
                print(after,text)
                continue
            x = embedding([before,text,after],shape=sample_shape)
            y = labeling(the_label)
            if np.random.random()<percent:
                train_x.append(x)
                train_y.append(y)
            else:
                test_x.append(x)
                test_y.append(y)

    # reshape(-1, label_len) is a no-op for non-empty splits and gives an empty
    # split the shape (0, label_len) rather than (0,).
    return (_stackSamples(train_x, sample_shape),
            np.array(train_y).reshape(-1, label_len),
            _stackSamples(test_x, sample_shape),
            np.array(test_y).reshape(-1, label_len))
def train():
    '''
    @summary: Train the TextCNN model on the cached dataset "data.pk" and write a
              weights checkpoint to "log/" whenever the validation loss improves
    @return: None
    '''
    # The cached dataset can be regenerated with:
    #   train_x,train_y,test_x,test_y = getTrainData()
    #   save((train_x,train_y,test_x,test_y),"data.pk")

    train_x,train_y,test_x,test_y = load("data.pk")
    model = models.getTextCNNModel()

    # To resume from an earlier checkpoint, load its weights here, e.g.:
    #   model.load_weights("log/ep012-loss0.049-val_loss0.071-f1_score0.979.h5")

    # Make sure the checkpoint directory exists before training starts, so the
    # first checkpoint write cannot fail on a missing directory.
    os.makedirs("log", exist_ok=True)

    callback = ModelCheckpoint(filepath="log/"+"ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",monitor="val_loss",save_best_only=True, save_weights_only=True, mode="min")
    # The model takes three inputs, one per leading slice of the x arrays.
    # validation_data is passed as the documented (x_val, y_val) tuple.
    model.fit(x=[train_x[0],train_x[1],train_x[2]],y=train_y,batch_size=24,epochs=400,callbacks=[callback],validation_data=([test_x[0],test_x[1],test_x[2]],test_y))
-
def test():
    '''
    @summary: Rebuild the TextCNN model, load the weights stored in
              "models/model_code.h5" and save the full model as "model_code.h5"
    @return: None
    '''
    weights_path = "models/model_code.h5"
    export_path = "model_code.h5"

    text_cnn = models.getTextCNNModel()
    text_cnn.load_weights(weights_path)
    text_cnn.save(export_path)
-
if __name__ == "__main__":
    # Script entry point: run training; swap in test() to export the saved model.
    train()
    # test()
|