# from general_data import getTokensLabels
import sys
import os
sys.path.append(os.path.abspath("../.."))
# from model import *
import re

import numpy as np
import pandas as pd
import psycopg2
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras import layers, models, optimizers, losses

from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.common.models import *

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

sourcetable = "label_guest_person"
domain = sourcetable.split("_")[2]  # "label_guest_person" -> "person"
model_file = "model_" + domain + ".model"
input_shape = (2, 10, 128)
output_shape = [5]
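# Shape conventions (inferred from how these constants are used below):
# input_shape is (2 context windows, 10 tokens per window, 128-dim word
# embeddings); output_shape is the number of person-role classes (5).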
def getTokensLabels(t, isTrain=True):
    '''
    @summary: fetch the model's input/output data
    @param:
        t: the table holding the label data
    @return: type: array, array, list  meaning: inputs, outputs, entity ids
    '''
    conn = psycopg2.connect(dbname="BiddingKG", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    # the table name t is interpolated directly; callers pass trusted names only
    if isTrain:
        # training split: everything except the first 2000 labeled entities
        sql = " select B.tokens,A.begin_index,A.end_index,C.label,A.entity_id from train_entity_copy A,train_sentences_copy B," + t + " C where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and A.entity_type='person' and A.entity_id=C.entity_id and C.entity_id not in (select entity_id from " + t + " order by entity_id limit 2000)"
    else:
        # test split: the first 2000 labeled entities
        sql = " select B.tokens,A.begin_index,A.end_index,C.label,A.entity_id from train_entity_copy A,train_sentences_copy B," + t + " C where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and A.entity_type='person' and A.entity_id=C.entity_id and C.entity_id in (select entity_id from " + t + " order by entity_id limit 2000)"
    cursor.execute(sql)
    print(sql)

    data_x = []
    data_y = []
    data_context = []

    rows = cursor.fetchmany(1000)
    allLimit = 250000
    all = 0
    while rows:
        for row in rows:
            if all >= allLimit:
                break
            # embed a fixed-size token window on each side of the entity
            item_x = embedding(spanWindow(tokens=row[0], begin_index=row[1], end_index=row[2], size=input_shape[1]), shape=input_shape)
            # item_x = encodeInput(spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10), word_len=50, word_flag=True,userFool=False)
            # _span = spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10,word_flag=False)
            # item_x = encodeInput(_span, word_len=10, word_flag=False,userFool=False)
            item_y = np.zeros(output_shape)
            item_y[row[3]] = 1  # one-hot label
            all += 1
            if not isTrain:
                data_context.append([row[4]])
            data_x.append(item_x)
            data_y.append(item_y)
        if all >= allLimit:
            # stop fetching once the cap is reached (the bare break above only
            # left the inner for loop)
            break
        rows = cursor.fetchmany(1000)
    cursor.close()
    conn.close()
    # (N, 2, 10, 128) -> (2, N, 10, 128), so the two windows feed two model inputs
    return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context
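# A minimal shape sanity check for the arrays returned above -- a sketch that
# assumes the label table exists and psycopg2 can reach the database configured
# in this file.
def _check_label_shapes(t="hand_label_person"):
    x, y, _ = getTokensLabels(t=t, isTrain=True)
    # x: (2, N, 10, 128) -- left/right windows stacked on axis 0
    # y: (N, 5)          -- one-hot labels
    assert x.shape[0] == input_shape[0] and x.shape[2:] == input_shape[1:]
    assert y.shape[1] == output_shape[0]
    print("x:", x.shape, "y:", y.shape)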
def getBiRNNModel():
    '''
    @summary: build the model
    '''
    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
    # C_input = layers.Input(shape=(10,128),dtype="float32")
    R_input = layers.Input(shape=input_shape[1:], dtype="float32")
    # lstm_0 = layers.Bidirectional(layers.LSTM(16,return_sequences=True))(ThreeBilstm(0)(input))
    lstm_0 = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(L_input)
    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
    # lstm_1 = layers.Bidirectional(layers.LSTM(16,return_sequences=True))(C_input)
    # avg_1 = layers.GlobalAveragePooling1D()(lstm_1)
    lstm_2 = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(R_input)
    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
    # layers.merge(..., mode="concat") is the Keras 1 API; concatenate is the
    # Keras 2 equivalent
    concat = layers.concatenate([avg_0, avg_2])

    output = layers.Dense(output_shape[0], activation="softmax")(concat)

    model = models.Model(inputs=[L_input, R_input], outputs=output)
    # note: categorical_crossentropy is the usual pairing with a softmax output;
    # binary_crossentropy is kept here as in the original
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
    return model
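# A quick offline smoke test of the architecture on random data -- a sketch
# that never touches the database; shapes follow input_shape/output_shape.
def _smoke_test_model(n=4):
    model = getBiRNNModel()
    left = np.random.rand(n, *input_shape[1:]).astype("float32")
    right = np.random.rand(n, *input_shape[1:]).astype("float32")
    pred = model.predict([left, right])
    # each row is a softmax distribution over the 5 classes, so it sums to ~1
    print(pred.shape, pred.sum(axis=1))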
def training():
    '''
    @summary: train the model
    '''
    model = getBiRNNModel()
    model.summary()
    train_x, train_y, _ = getTokensLabels(isTrain=True, t="hand_label_person")
    # print(np.shape(train_x))
    test_x, test_y, test_context = getTokensLabels(isTrain=False, t="hand_label_person")
    checkpoint = ModelCheckpoint(model_file + ".hdf5", monitor="val_loss", verbose=1, save_best_only=True, mode='min')
    history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y,
                              validation_data=([test_x[0], test_x[1]], test_y),
                              epochs=100, batch_size=256, shuffle=True, callbacks=[checkpoint])
    # predict_y = model.predict([test_x[0],test_x[1]])
    #
    # conn = psycopg2.connect(dbname='BiddingKG', user='postgres',password='postgres',host='192.168.2.101')
    # cursor = conn.cursor()
    # table = 'predict_person'
    # cursor.execute(" select to_regclass('"+table+"') is null ")
    # notExists = cursor.fetchall()[0][0]
    # if notExists:
    #     cursor.execute(" create table "+table+" (entity_id text,predict int,label int)")
    # else:
    #     cursor.execute(" delete from "+table)
    #
    # with open("predict.txt","w",encoding="utf8") as f:
    #     for i in range(len(predict_y)):
    #         if np.argmax(predict_y[i]) != np.argmax(test_y[i]):
    #             f.write("\n")
    #             f.write(str(test_context[i][0]))
    #             f.write("\t")
    #             f.write(str(np.argmax(predict_y[i])))
    #             f.write("\t")
    #             f.write(str(np.argmax(test_y[i])))
    #             f.write("\n")
    #         sql = " insert into "+table+"(entity_id,predict,label) values('"+str(test_context[i][0])+"','"+str(int(np.argmax(predict_y[i])))+"','"+str(int(np.argmax(test_y[i])))+"')"
    #         # print(sql)
    #         cursor.execute(sql)
    #     conn.commit()
    #     cursor.close()
    #     conn.close()
    #     f.flush()
    #     f.close()

    # print_metrics(history_model)
def train():
    train_x, train_y, _ = getTokensLabels(isTrain=True, t="hand_label_person")
    test_x, test_y, test_context = getTokensLabels(isTrain=False, t="hand_label_person")
    with tf.Session() as sess:
        vocab, matrix = getVocabAndMatrix(getModel_w2v(), Embedding_size=128)
        # note: classes=4 here, unlike output_shape[0]=5 used elsewhere in this file
        model = getBiLSTMModel(input_shape=(2, 10, 128), vocab=vocab, embedding_weights=matrix, classes=4)
        callback = ModelCheckpoint(
            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
        model.fit(x=[train_x[0], train_x[1]], y=train_y, batch_size=128, epochs=600,
                  callbacks=[callback], validation_data=([test_x[0], test_x[1]], test_y))
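# Note: train() appears to be an alternative pipeline where getBiLSTMModel
# (from BiddingKG.dl.common.models) carries its own embedding layer built from
# vocab/matrix, while training() above pre-embeds tokens via
# embedding()/spanWindow(). The pre-embedded float arrays returned by
# getTokensLabels may not match a vocab-indexed input, so check which encoding
# that model expects before using this path.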
def predict():
    '''
    @summary: run prediction
    '''
    def getTokensLabels():
        conn = psycopg2.connect(dbname="BidiPro", user="postgres", password="postgres", host="192.168.2.101")
        cursor = conn.cursor()
        # sql = '''
        # SELECT s.tokens,e.begin_index,e.end_index,e.doc_id,e.entity_id,e.sentence_index,e.entity_text,e.entity_type from entity_mention e,sentences s
        # WHERE s.doc_id=e.doc_id AND s.sentence_index=e.sentence_index AND e.entity_id not in (SELECT entity_id from entity_label) and entity_type in ('person') limit 10000
        # '''
        sql = '''
        SELECT s.tokens,e.begin_index,e.end_index,e.doc_id,e.entity_id,e.sentence_index,e.entity_text,e.entity_type from entity_mention e,sentences s
        WHERE s.doc_id=e.doc_id AND s.sentence_index=e.sentence_index AND e.doc_id in(SELECT doc_id from articles_validation) and entity_type in ('person')
        '''
        cursor.execute(sql)
        print(sql)
        data_x = []
        doc_id = []
        ent_id = []
        sen = []
        ent_text = []
        dianhua = []  # phone numbers
        rows = cursor.fetchmany(1000)
        # '|' inside a character class is a literal, so patterns like [:|:]
        # also matched '|'; the classes below drop the stray '|'
        key_word = re.compile(r'电话[::]\d{7,12}|联系方式[::]\d{7,12}')
        phone = re.compile(r'1[34578][0-9][-—]?\d{4}[-—]?\d{4}'
                           r'|\d{3,4}[-—]\d{7,8}/\d{3,8}'
                           r'|\d{3,4}[-—]\d{7,8}转\d{1,4}'
                           r'|\d{3,4}[-—]\d{7,8}'
                           r'|[((]0\d{2,3}[))]\d{7,8}')  # contact phone numbers
        while rows:
            for row in rows:
                # embed the left/right context windows with the same size/shape
                # as in training
                item_x = embedding(spanWindow(tokens=row[0], begin_index=row[1], end_index=row[2], size=input_shape[1]), shape=input_shape)
                s = spanWindow(tokens=row[0], begin_index=row[1], end_index=row[2], size=15)
                s2 = ''.join(s[1])
                s2 = re.sub(r',\)', '-', s2)  # the unescaped ')' in the original pattern was a regex error
                s2 = re.sub(r'\s', '', s2)
                have_key = re.findall(key_word, s2)
                have_phone = re.findall(phone, s2)

                if have_phone:
                    dianhua.append(have_phone)
                elif have_key:
                    dianhua.append(have_key)  # was appending the (empty) phone list
                else:
                    dianhua.append('')
                sen.append(s2)
                ent_id.append(row[4])
                ent_text.append(row[6])
                data_x.append(item_x)
                doc_id.append(row[3])
            rows = cursor.fetchmany(1000)
        cursor.close()
        conn.close()
        return np.transpose(np.array(data_x), (1, 0, 2, 3)), doc_id, ent_id, sen, ent_text, dianhua

    test_x, doc_id, ent_id, sen, ent_text, dianhua = getTokensLabels()
    model = models.load_model("model_person.model", custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    predict_y = model.predict([test_x[0], test_x[1]])
    label = [np.argmax(y) for y in predict_y]
    data = {'doc_id': doc_id, 'ent_id': ent_id, 'sen': sen, 'entity_text': ent_text, 'dianhua': dianhua, 'label': label}
    df = pd.DataFrame(data)
    df.to_excel('data/person_phone.xls')
    conn = psycopg2.connect(dbname='BidiPro', user='postgres', password='postgres', host='192.168.2.101')
    cursor = conn.cursor()
    table = 'person_phone_predict'
    cursor.execute(" select to_regclass('" + table + "') is null ")
    notExists = cursor.fetchall()[0][0]
    if notExists:
        cursor.execute(" create table " + table + " (doc_id text,entity_id text,entity text,label int,predict text,phone text)")
    else:
        cursor.execute(" delete from " + table)

    for i in range(len(df['ent_id'])):
        pre_y = [str(a) for a in predict_y[i]]
        # parameterized insert avoids quoting problems with entity text
        sql = " insert into " + table + "(doc_id,entity_id,entity,label,predict,phone) values(%s,%s,%s,%s,%s,%s)"
        cursor.execute(sql, (str(df['doc_id'][i]), str(df['ent_id'][i]), str(df['entity_text'][i]),
                             int(df['label'][i]), ','.join(pre_y), ','.join(df['dianhua'][i])))
    conn.commit()
    print('commit done')
    cursor.close()
    conn.close()
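# A small, hedged demo of the phone patterns used in predict() on made-up
# strings; the sample numbers below are illustrative only.
def _demo_phone_regex():
    phone = re.compile(r'1[34578][0-9][-—]?\d{4}[-—]?\d{4}'
                       r'|\d{3,4}[-—]\d{7,8}'
                       r'|[((]0\d{2,3}[))]\d{7,8}')
    for s in ('联系电话:13812345678', '0571-87654321', '(010)12345678'):
        # mobile, area-code landline, and parenthesized area-code forms
        print(s, '->', phone.findall(s))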
if __name__ == '__main__':
    # get_data()
    # label_data()
    # post_data()
    training()
    predict()
    # train()