# train.py

#from general_data import getTokensLabels
import sys
import os
sys.path.append(os.path.abspath("../.."))
# from model import *
from keras.callbacks import ModelCheckpoint
from keras import layers,models,optimizers,losses
import psycopg2
from BiddingKG.dl.common.Utils import *
import pandas as pd
from BiddingKG.dl.common.models import *
# re/numpy/tensorflow are used below; imported explicitly here in case the
# wildcard imports above do not already expose them
import re
import numpy as np
import tensorflow as tf
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

sourcetable = "label_guest_person"
domain = sourcetable.split("_")[2]
model_file = "model_"+domain+".model"

input_shape = (2,10,128)    # (left/right windows, window size in tokens, embedding dim)
output_shape = [5]          # number of label classes
def getTokensLabels(t,isTrain=True):
    '''
    @summary: fetch the model's input/output data
    @param:
        t: table that holds the label data
    @return: type:array,array,list  meaning: inputs, outputs, entity ids
    '''
    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    if isTrain:
        # training set: all labelled entities except the first 2000
        sql = " select B.tokens,A.begin_index,A.end_index,C.label,A.entity_id from train_entity_copy A,train_sentences_copy B,"+t+" C where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and A.entity_type='person' and A.entity_id=C.entity_id and C.entity_id not in (select entity_id from "+t+" order by entity_id limit 2000)"
    else:
        # validation set: the first 2000 labelled entities
        sql = " select B.tokens,A.begin_index,A.end_index,C.label,A.entity_id from train_entity_copy A,train_sentences_copy B,"+t+" C where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and A.entity_type='person' and A.entity_id=C.entity_id and C.entity_id in (select entity_id from "+t+" order by entity_id limit 2000)"
    cursor.execute(sql)
    print(sql)
    data_x = []
    data_y = []
    data_context = []
    rows = cursor.fetchmany(1000)
    allLimit = 250000
    all = 0
    i = 0
    while(rows):
        for row in rows:
            if all>=allLimit:
                break
            item_x = embedding(spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=input_shape[1]),shape=input_shape)
            # item_x = encodeInput(spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10), word_len=50, word_flag=True,userFool=False)
            # _span = spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10,word_flag=False)
            # item_x = encodeInput(_span, word_len=10, word_flag=False,userFool=False)
            item_y = np.zeros(output_shape)
            item_y[row[3]] = 1    # one-hot encode the label
            all += 1
            if not isTrain:
                item_context = []
                item_context.append(row[4])
                data_context.append(item_context)
            data_x.append(item_x)
            data_y.append(item_y)
            i += 1
        rows = cursor.fetchmany(1000)
    return np.transpose(np.array(data_x),(1,0,2,3)),np.array(data_y),data_context
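# Layout note (assuming embedding()/spanWindow() from BiddingKG.dl.common
# return one (2,10,128) item per entity, i.e. a left and a right window of
# 10 token embeddings): the final transpose turns the stacked
# (n_samples,2,10,128) array into (2,n_samples,10,128), so callers can feed
# the two windows straight into the two-input model, e.g.
#     x, y, _ = getTokensLabels(t="hand_label_person", isTrain=True)
#     model.fit(x=[x[0], x[1]], y=y, ...)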
def getBiRNNModel():
    '''
    @summary: build the model
    '''
    L_input = layers.Input(shape=input_shape[1:],dtype="float32")
    #C_input = layers.Input(shape=(10,128),dtype="float32")
    R_input = layers.Input(shape=input_shape[1:],dtype="float32")
    #lstm_0 = layers.Bidirectional(layers.LSTM(16,return_sequences=True))(ThreeBilstm(0)(input))
    lstm_0 = layers.Bidirectional(layers.LSTM(32,return_sequences=True))(L_input)
    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
    #lstm_1 = layers.Bidirectional(layers.LSTM(16,return_sequences=True))(C_input)
    #avg_1 = layers.GlobalAveragePooling1D()(lstm_1)
    lstm_2 = layers.Bidirectional(layers.LSTM(32,return_sequences=True))(R_input)
    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
    #concat = layers.merge([avg_0,avg_1,avg_2],mode="concat")
    # layers.merge is the Keras 1 API; on Keras 2 this would be layers.concatenate([avg_0,avg_2])
    concat = layers.merge([avg_0,avg_2],mode="concat")
    output = layers.Dense(output_shape[0],activation="softmax")(concat)
    model = models.Model(inputs=[L_input,R_input],outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005),loss=losses.binary_crossentropy,metrics=[precision,recall,f1_score])
    return model
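# Architecture note: the left and right context windows of a person entity
# go through separate bidirectional LSTMs, each is reduced with global
# average pooling, the two pooled vectors are concatenated and classified
# with a 5-way softmax; precision/recall/f1_score are the custom metrics
# pulled in by the wildcard imports above. A minimal usage sketch
# (assuming the label tables read by getTokensLabels are available):
#     m = getBiRNNModel()
#     x, y, _ = getTokensLabels(t="hand_label_person", isTrain=True)
#     m.fit(x=[x[0], x[1]], y=y, batch_size=256, epochs=1)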
def training():
    '''
    @summary: train the model
    '''
    model = getBiRNNModel()
    model.summary()
    train_x,train_y,_ = getTokensLabels(isTrain=True,t="hand_label_person")
    #print(np.shape(train_x))
    test_x,test_y,test_context = getTokensLabels(isTrain=False,t="hand_label_person")
    checkpoint = ModelCheckpoint(model_file+".hdf5",monitor="val_loss",verbose=1,save_best_only=True,mode='min')
    history_model = model.fit(x=[train_x[0],train_x[1]],y=train_y,validation_data=([test_x[0],test_x[1]],test_y),epochs=100,batch_size=256,shuffle=True,callbacks=[checkpoint])
    # predict_y = model.predict([test_x[0],test_x[1]])
    #
    # conn = psycopg2.connect(dbname='BiddingKG', user='postgres',password='postgres',host='192.168.2.101')
    # cursor = conn.cursor()
    # table = 'predict_person'
    # cursor.execute(" select to_regclass('"+table+"') is null ")
    # notExists = cursor.fetchall()[0][0]
    # if notExists:
    #     cursor.execute(" create table "+table+" (entity_id text,predect int,label int)")
    # else:
    #     cursor.execute(" delete from "+table)
    #
    #
    #
    # with open("predict.txt","w",encoding="utf8") as f:
    #     for i in range(len(predict_y)):
    #         if np.argmax(predict_y[i]) != np.argmax(test_y[i]):
    #             f.write("\n")
    #             f.write(str(test_context[i][0]))
    #             f.write("\t")
    #             f.write(str(np.argmax(predict_y[i])))
    #             f.write("\t")
    #             f.write(str(np.argmax(test_y[i])))
    #             f.write("\n")
    #         sql = " insert into "+table+"(entity_id ,predect ,label) values('"+str(test_context[i][0])+"','"+str(int(np.argmax(predict_y[i])))+"','"+str(int(np.argmax(test_y[i])))+"')"
    #         # print(sql)
    #         cursor.execute(sql)
    #     conn.commit()
    #     cursor.close()
    #     conn.close()
    #     f.flush()
    #     f.close()
    #print_metrics(history_model)
def train():
    # alternative training entry point using the shared BiLSTM model from BiddingKG.dl.common.models
    train_x,train_y,_ = getTokensLabels(isTrain=True,t="hand_label_person")
    test_x,test_y,test_context = getTokensLabels(isTrain=False,t="hand_label_person")
    with tf.Session() as sess:
        vocab,matrix = getVocabAndMatrix(getModel_w2v(),Embedding_size=128)
        model = getBiLSTMModel(input_shape=(2,10,128), vocab=vocab, embedding_weights=matrix, classes=4)
        callback = ModelCheckpoint(filepath="log/"+"ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",monitor="val_loss",save_best_only=True, save_weights_only=True, mode="min")
        model.fit(x=[train_x[0],train_x[1]],y=train_y,batch_size=128,epochs=600,callbacks=[callback],validation_data=[[test_x[0],test_x[1]],test_y])
def predict():
    '''
    @summary: run prediction on the data
    '''
    def getTokensLabels():
        conn = psycopg2.connect(dbname="BidiPro",user="postgres",password="postgres",host="192.168.2.101")
        cursor = conn.cursor()
        #sql = '''
        #SELECT s.tokens,e.begin_index,e.end_index,e.doc_id,e.entity_id,e.sentence_index,e.entity_text,e.entity_type from entity_mention e,sentences s
        #WHERE s.doc_id=e.doc_id AND s.sentence_index=e.sentence_index AND e.entity_id not in (SELECT entity_id from entity_label) and entity_type in ('person') limit 10000
        #'''
        sql = '''
        SELECT s.tokens,e.begin_index,e.end_index,e.doc_id,e.entity_id,e.sentence_index,e.entity_text,e.entity_type from entity_mention e,sentences s
        WHERE s.doc_id=e.doc_id AND s.sentence_index=e.sentence_index AND e.doc_id in(SELECT doc_id from articles_validation) and entity_type in ('person')
        '''
        cursor.execute(sql)
        print(sql)
        data_x = []
        doc_id = []
        ent_id = []
        sen = []
        ent_text = []
        dianhua = []
        rows = cursor.fetchmany(1000)
        key_word = re.compile('电话[:|:]\d{7,12}|联系方式[:|:]\d{7,12}')
        phone = re.compile('1[3|4|5|7|8][0-9][-|——|—]?\d{4}[-|——|—]?\d{4}|\d{3,4}[-|——|—]\d{7,8}/\d{3,8}|\d{3,4}[-|——|—]\d{7,8}转\d{1,4}|\d{3,4}[-|——|—]\d{7,8}|[\(|\(]0\d{2,3}[\)|\)]\d{7,8}') # contact phone number
        while(rows):
            for row in rows:
                item_x = embedding(spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2]))
                s = spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=15)
                s2 = ''.join(s[1])
                s2 = re.sub(r',\)', '-', s2)    # ')' escaped so the pattern compiles
                s2 = re.sub('\s','',s2)
                have_key = re.findall(key_word, s2)
                have_phone = re.findall(phone, s2)
                if have_phone:
                    dianhua.append(have_phone)
                elif have_key:
                    dianhua.append(have_phone)    # note: have_phone is empty in this branch
                else:
                    dianhua.append('')
                sen.append(s2)
                ent_id.append(row[4])
                ent_text.append(row[6])
                data_x.append(item_x)
                doc_id.append(row[3])
            rows = cursor.fetchmany(1000)
        cursor.close()
        conn.close()
        return np.transpose(np.array(data_x),(1,0,2,3)),doc_id,ent_id,sen,ent_text,dianhua
    test_x,doc_id,ent_id,sen,ent_text,dianhua = getTokensLabels()
    model = models.load_model("model_person.model",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
    predict_y = model.predict([test_x[0],test_x[1]])
    label = [np.argmax(y) for y in predict_y]
    data = {'doc_id':doc_id, 'ent_id':ent_id, 'sen':sen, 'entity_text':ent_text, 'dianhua':dianhua, 'label':label}
    df = pd.DataFrame(data)
    df.to_excel('data/person_phone.xls')
    conn = psycopg2.connect(dbname='BidiPro', user='postgres',password='postgres',host='192.168.2.101')
    cursor = conn.cursor()
    table = 'person_phone_predict'
    cursor.execute(" select to_regclass('"+table+"') is null ")
    notExists = cursor.fetchall()[0][0]
    if notExists:
        cursor.execute(" create table "+table+" (doc_id text,entity_id text,entity text,label int,predict text,phone text)")
    else:
        cursor.execute(" delete from "+table)
    for i in range(len(df['ent_id'])):
        pre_y = [str(a) for a in predict_y[i]]
        sql = " insert into "+table+"(doc_id,entity_id,entity,label,predict,phone) values('"+str(df['doc_id'][i])+"','"+str(df['ent_id'][i])+"','"+str(df['entity_text'][i])+"',"+str(int(df['label'][i]))+",'"+str(','.join(pre_y))+"','"+str(','.join(df['dianhua'][i]))+"')"
        #print(sql)
        cursor.execute(sql)
    conn.commit()
    print('commit completed')
    cursor.close()
    conn.close()
if __name__ == '__main__':
    #get_data()
    #label_data()
    #post_data()
    training()
    predict()
    # train()