#from general_data import getTokensLabels
import sys
import os
sys.path.append(os.path.abspath("../.."))
# from model import *
import re

import numpy as np
import pandas as pd
import psycopg2
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras import layers, models, optimizers, losses

from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.common.models import *

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

sourcetable = "label_guest_person"
domain = sourcetable.split("_")[2]
model_file = "model_" + domain + ".model"

# two context windows of 10 tokens each, embedded into 128 dimensions
input_shape = (2, 10, 128)
output_shape = [5]


def getTokensLabels(t, isTrain=True):
    '''
    @summary: fetch the model's input/output data
    @param: t: table that holds the label data
    @return: type: array, array, list
             meaning: inputs, outputs, entity ids
    '''
    conn = psycopg2.connect(dbname="BiddingKG", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    # the first 2000 entity_ids (by sort order) are held out as the test split
    if isTrain:
        sql = " select B.tokens,A.begin_index,A.end_index,C.label,A.entity_id" \
              " from train_entity_copy A,train_sentences_copy B," + t + " C" \
              " where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index" \
              " and A.entity_type='person' and A.entity_id=C.entity_id" \
              " and C.entity_id not in (select entity_id from " + t + " order by entity_id limit 2000)"
    else:
        sql = " select B.tokens,A.begin_index,A.end_index,C.label,A.entity_id" \
              " from train_entity_copy A,train_sentences_copy B," + t + " C" \
              " where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index" \
              " and A.entity_type='person' and A.entity_id=C.entity_id" \
              " and C.entity_id in (select entity_id from " + t + " order by entity_id limit 2000)"
    cursor.execute(sql)
    print(sql)

    data_x = []
    data_y = []
    data_context = []
    rows = cursor.fetchmany(1000)
    allLimit = 250000
    all = 0
    i = 0
    while rows:
        for row in rows:
            if all >= allLimit:
                break
            # embed the left/right token windows around the entity span
            item_x = embedding(spanWindow(tokens=row[0], begin_index=row[1], end_index=row[2], size=input_shape[1]), shape=input_shape)
            # item_x = encodeInput(spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10), word_len=50, word_flag=True,userFool=False)
            # _span = spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10,word_flag=False)
            # item_x = encodeInput(_span, word_len=10, word_flag=False,userFool=False)
            # one-hot encode the label
            item_y = np.zeros(output_shape)
            item_y[row[3]] = 1
            all += 1
            if not isTrain:
                data_context.append([row[4]])
            data_x.append(item_x)
            data_y.append(item_y)
            i += 1
        rows = cursor.fetchmany(1000)
    cursor.close()
    conn.close()
    # reorder to (2, n_samples, 10, 128) so result[0]/result[1] are the two windows
    return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context


def getBiRNNModel():
    '''
    @summary: build the model
    '''
    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
    #C_input = layers.Input(shape=(10,128),dtype="float32")
    R_input = layers.Input(shape=input_shape[1:], dtype="float32")
    #lstm_0 = layers.Bidirectional(layers.LSTM(16,return_sequences=True))(ThreeBilstm(0)(input))
    lstm_0 = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(L_input)
    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
    #lstm_1 = layers.Bidirectional(layers.LSTM(16,return_sequences=True))(C_input)
    #avg_1 = layers.GlobalAveragePooling1D()(lstm_1)
    lstm_2 = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(R_input)
    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
    #concat = layers.merge([avg_0,avg_1,avg_2],mode="concat")
    # Keras 1-style merge; under Keras 2 this would be layers.concatenate([avg_0, avg_2])
    concat = layers.merge([avg_0, avg_2], mode="concat")
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    return model
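# Illustrative only (not part of the original pipeline): a minimal smoke-test
# sketch that builds the network and runs a forward pass on random arrays
# shaped like the real inputs, showing how the two (batch, 10, 128) context
# windows map onto the two Input layers above. The function name is made up
# for this sketch.
def _smoke_test_model(batch_size=4):
    model = getBiRNNModel()
    fake_left = np.random.rand(batch_size, input_shape[1], input_shape[2])
    fake_right = np.random.rand(batch_size, input_shape[1], input_shape[2])
    probs = model.predict([fake_left, fake_right])
    # softmax over output_shape[0] = 5 classes, one row per sample
    assert probs.shape == (batch_size, output_shape[0])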
def training():
    '''
    @summary: train the model
    '''
    model = getBiRNNModel()
    model.summary()
    train_x, train_y, _ = getTokensLabels(isTrain=True, t="hand_label_person")
    #print(np.shape(train_x))
    test_x, test_y, test_context = getTokensLabels(isTrain=False, t="hand_label_person")
    # keep only the weights with the lowest validation loss
    checkpoint = ModelCheckpoint(model_file + ".hdf5", monitor="val_loss", verbose=1, save_best_only=True, mode='min')
    history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y,
                              validation_data=([test_x[0], test_x[1]], test_y),
                              epochs=100, batch_size=256, shuffle=True, callbacks=[checkpoint])

    # predict_y = model.predict([test_x[0],test_x[1]])
    #
    # conn = psycopg2.connect(dbname='BiddingKG', user='postgres',password='postgres',host='192.168.2.101')
    # cursor = conn.cursor()
    # table = 'predict_person'
    # cursor.execute(" select to_regclass('"+table+"') is null ")
    # notExists = cursor.fetchall()[0][0]
    # if notExists:
    #     cursor.execute(" create table "+table+" (entity_id text,predect int,label int)")
    # else:
    #     cursor.execute(" delete from "+table)
    #
    # with open("predict.txt","w",encoding="utf8") as f:
    #     for i in range(len(predict_y)):
    #         if np.argmax(predict_y[i]) != np.argmax(test_y[i]):
    #             f.write("\n")
    #             f.write(str(test_context[i][0]))
    #             f.write("\t")
    #             f.write(str(np.argmax(predict_y[i])))
    #             f.write("\t")
    #             f.write(str(np.argmax(test_y[i])))
    #             f.write("\n")
    #         sql = " insert into "+table+"(entity_id ,predect ,label) values('"+str(test_context[i][0])+"','"+str(int(np.argmax(predict_y[i])))+"','"+str(int(np.argmax(test_y[i])))+"')"
    #         # print(sql)
    #         cursor.execute(sql)
    #     conn.commit()
    #     cursor.close()
    #     conn.close()
    #     f.flush()
    #     f.close()
    #print_metrics(history_model)


def train():
    train_x, train_y, _ = getTokensLabels(isTrain=True, t="hand_label_person")
    test_x, test_y, test_context = getTokensLabels(isTrain=False, t="hand_label_person")
    with tf.Session() as sess:
        vocab, matrix = getVocabAndMatrix(getModel_w2v(), Embedding_size=128)
        model = getBiLSTMModel(input_shape=(2, 10, 128), vocab=vocab, embedding_weights=matrix, classes=4)
        callback = ModelCheckpoint(
            filepath="log/ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
        model.fit(x=[train_x[0], train_x[1]], y=train_y, batch_size=128, epochs=600,
                  callbacks=[callback], validation_data=([test_x[0], test_x[1]], test_y))
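# Aside (illustrative, not used by the functions in this file): every query
# here is built by string concatenation, which breaks on values containing
# quotes. psycopg2's standard %s parameter binding avoids that; the insert in
# predict() below could instead be written as (the *_val names are
# hypothetical per-row values):
#
#   cursor.execute(
#       "insert into person_phone_predict(doc_id,entity_id,entity,label,predict,phone)"
#       " values (%s,%s,%s,%s,%s,%s)",
#       (doc_id_val, ent_id_val, entity_val, label_val, predict_val, phone_val))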
def predict():
    '''
    @summary: run prediction
    '''
    def getTokensLabels():
        conn = psycopg2.connect(dbname="BidiPro", user="postgres", password="postgres", host="192.168.2.101")
        cursor = conn.cursor()
        #sql = '''
        #SELECT s.tokens,e.begin_index,e.end_index,e.doc_id,e.entity_id,e.sentence_index,e.entity_text,e.entity_type from entity_mention e,sentences s
        #WHERE s.doc_id=e.doc_id AND s.sentence_index=e.sentence_index AND e.entity_id not in (SELECT entity_id from entity_label) and entity_type in ('person') limit 10000
        #'''
        sql = '''
        SELECT s.tokens,e.begin_index,e.end_index,e.doc_id,e.entity_id,e.sentence_index,e.entity_text,e.entity_type
        from entity_mention e,sentences s
        WHERE s.doc_id=e.doc_id AND s.sentence_index=e.sentence_index
        AND e.doc_id in (SELECT doc_id from articles_validation) and entity_type in ('person')
        '''
        cursor.execute(sql)
        print(sql)

        data_x = []
        doc_id = []
        ent_id = []
        sen = []
        ent_text = []
        dianhua = []
        rows = cursor.fetchmany(1000)
        # "电话:"/"联系方式:" (phone/contact) keywords followed by 7-12 digits
        key_word = re.compile(r'电话[:|:]\d{7,12}|联系方式[:|:]\d{7,12}')
        # mobile and landline contact-number patterns
        phone = re.compile(r'1[3|4|5|7|8][0-9][-|——|—]?\d{4}[-|——|—]?\d{4}|\d{3,4}[-|——|—]\d{7,8}/\d{3,8}|\d{3,4}[-|——|—]\d{7,8}转\d{1,4}|\d{3,4}[-|——|—]\d{7,8}|[\(|\(]0\d{2,3}[\)|\)]\d{7,8}')
        while rows:
            for row in rows:
                item_x = embedding(spanWindow(tokens=row[0], begin_index=row[1], end_index=row[2]))
                s = spanWindow(tokens=row[0], begin_index=row[1], end_index=row[2], size=15)
                s2 = ''.join(s[1])
                s2 = re.sub(r',\)', '-', s2)  # ')' escaped so the pattern matches the literal character
                s2 = re.sub(r'\s', '', s2)
                have_key = re.findall(key_word, s2)
                have_phone = re.findall(phone, s2)
                if have_phone:
                    dianhua.append(have_phone)
                elif have_key:
                    # fall back to the keyword match when no bare number was found
                    dianhua.append(have_key)
                else:
                    dianhua.append('')
                sen.append(s2)
                ent_id.append(row[4])
                ent_text.append(row[6])
                data_x.append(item_x)
                doc_id.append(row[3])
            rows = cursor.fetchmany(1000)
        cursor.close()
        conn.close()
        return np.transpose(np.array(data_x), (1, 0, 2, 3)), doc_id, ent_id, sen, ent_text, dianhua

    test_x, doc_id, ent_id, sen, ent_text, dianhua = getTokensLabels()
    model = models.load_model("model_person.model",
                              custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    predict_y = model.predict([test_x[0], test_x[1]])
    label = [np.argmax(y) for y in predict_y]
    data = {'doc_id': doc_id, 'ent_id': ent_id, 'sen': sen,
            'entity_text': ent_text, 'dianhua': dianhua, 'label': label}
    df = pd.DataFrame(data)
    df.to_excel('data/person_phone.xls')

    conn = psycopg2.connect(dbname='BidiPro', user='postgres', password='postgres', host='192.168.2.101')
    cursor = conn.cursor()
    table = 'person_phone_predict'
    cursor.execute(" select to_regclass('" + table + "') is null ")
    notExists = cursor.fetchall()[0][0]
    if notExists:
        cursor.execute(" create table " + table + " (doc_id text,entity_id text,entity text,label int,predict text,phone text)")
    else:
        cursor.execute(" delete from " + table)
    for i in range(len(df['ent_id'])):
        pre_y = [str(a) for a in predict_y[i]]
        sql = " insert into " + table + "(doc_id,entity_id,entity,label,predict,phone) values('" \
              + str(df['doc_id'][i]) + "','" + str(df['ent_id'][i]) + "','" + str(df['entity_text'][i]) + "'," \
              + str(int(df['label'][i])) + ",'" + str(','.join(pre_y)) + "','" + str(','.join(df['dianhua'][i])) + "')"
        #print(sql)
        cursor.execute(sql)
    conn.commit()
    print('commit finished')
    cursor.close()
    conn.close()


if __name__ == '__main__':
    #get_data()
    #label_data()
    #post_data()
    training()
    predict()
    # train()
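# Usage sketch (an assumption, mirroring the load_model call in predict()):
# the best checkpoint written by training() lands in "model_person.model.hdf5"
# and must be reloaded with the same custom metrics it was compiled with, e.g.
#
#   model = models.load_model("model_person.model.hdf5",
#                             custom_objects={'precision': precision,
#                                             'recall': recall,
#                                             'f1_score': f1_score})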