'''
Created on April 15, 2019

@author: User
'''
import os
import sys
sys.path.append(os.path.abspath("../.."))
import pandas as pd
import gensim
import numpy as np
import math
import models
from keras.callbacks import ModelCheckpoint
from BiddingKG.dl.common.Utils import *

def embedding(datas,shape):
    '''
    @summary: look up the word vector of every character of the given entries
    @param:
        datas: list of entries; each one is converted with str() and read
               character by character
        shape: shape of the result; shape[1] is the maximum number of
               characters kept per entry (the last shape[1] characters are used)
    @return: array of the requested shape holding the embeddings; positions
             whose character is not in the vocabulary, or that lie beyond the
             end of the entry, stay zero
    '''
    # getModel_word is not defined in this module; presumably it comes from the
    # star import of BiddingKG.dl.common.Utils -- TODO confirm.
    model_w2v = getModel_word()
    embed = np.zeros(shape)
    length = shape[1]
    # Row of `embed` that belongs to the entry currently being processed.
    out_index = 0
    #print(datas)
    for data in datas:
        # Position of the current character inside the row.
        index = 0
        # Only the trailing shape[1] characters of the entry are embedded.
        for item in str(data)[-shape[1]:]:
            # Guard against writing past the second dimension (relevant e.g.
            # when shape[1] is 0, where the slice above keeps the whole string).
            if index>=length:
                break
            if item in model_w2v.vocab:
                embed[out_index][index] = model_w2v[item]
                index += 1
            else:
                # Unknown characters keep their zero vector; the lookup of an
                # 'unk' vector below is disabled.
                #embed[out_index][index] = model_w2v['unk']
                index += 1
        out_index += 1
    return embed

def labeling(label,out_len=2):
    '''
    @summary: build a one-hot vector for a class index
    @param:
        label: index of the position that is set to 1
        out_len: length of the returned vector
    @return: array of length out_len that is zero everywhere except at label
    '''
    out = np.zeros((out_len))
    out[label] = 1
    return out

def getTrainData(percent=0.9):
    train_x = []
    train_y = []
    test_x = []
    test_y = []
    # Excel files that are read relative to the current working directory.
    files = ["批量.xls","剩余手工标注.xls"]
    for file in files:
        df = pd.read_excel(file)
        for before,text,after,label in zip(df["list_before"],df["list_code"],df["list_after"],df["list_label"]):
            # A missing (NaN) label is treated as class 0.
            the_label = 0
            if not math.isnan(label):
                the_label = int(label)
            # Rows whose label is neither 0 nor 1 are printed and skipped.
            if the_label not in [0,1]:
                print(after,text)
                continue
            # One sample is the three text fields embedded into a (3,40,60) array.
            x = embedding([before,text,after],shape=(3,40,60))
            y = labeling(the_label)
            if np.random.random()