# getWrongData.py
from BiddingKG.dl.common.Utils import *
import psycopg2
import pandas as pd
import math
import glob
import re
from Entity2DB import *
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
  9. def getWrongData():
  10. def spanWindow(tokens,begin_index,end_index,size):
  11. '''
  12. @summary:取得某个实体的上下文词汇
  13. @param:
  14. tokens:句子分词list
  15. begin_index:实体的开始index
  16. end_index:实体的结束index
  17. size:左右两边各取多少个词
  18. @return: list,实体的上下文词汇
  19. '''
  20. length_tokens = len(tokens)
  21. if begin_index>size:
  22. begin = begin_index-size
  23. else:
  24. begin = 0
  25. if end_index+size<length_tokens:
  26. end = end_index+size+1
  27. else:
  28. end = length_tokens
  29. result = []
  30. result.append(tokens[begin:begin_index])
  31. result.append(tokens[begin_index:end_index+1])
  32. result.append(tokens[end_index+1:end])
  33. #print(result)
  34. return result
  35. files = []
  36. for file in glob.glob("C:\\Users\\User\\Desktop\\20190416要素\\*.html"):
  37. filename = file.split("\\")[-1]
  38. files.append(filename)
  39. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  40. cursor = conn.cursor()
  41. sql = '''
  42. select A.entity_id,A.entity_text,A.begin_index,A.end_index,A.label,A.values,B.tokens,A.doc_id
  43. from entity_mention A,sentences B
  44. where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index
  45. and A.entity_type in ('org','company')
  46. and A.handlabel='1'
  47. and A.label!='None'
  48. and not exists(select 1 from turn_label where entity_id=A.entity_id)
  49. order by A.label
  50. '''
  51. cursor.execute(sql)
  52. rows = cursor.fetchall()
  53. list_entity_id = []
  54. list_before = []
  55. list_after = []
  56. list_text = []
  57. list_label = []
  58. list_prob = []
  59. repeat = set()
  60. for row in rows:
  61. entity_id = row[0]
  62. #entity_text = row[1]
  63. begin_index = row[2]
  64. end_index = row[3]
  65. label = int(row[4])
  66. values = row[5][1:-1].split(",")
  67. tokens = row[6]
  68. doc_id = row[7]
  69. if doc_id not in files:
  70. continue
  71. if float(values[label])<0.5:
  72. continue
  73. beforeafter = spanWindow(tokens, begin_index, end_index, 10)
  74. if ("".join(beforeafter[0]),"".join(beforeafter[1]),"".join(beforeafter[2])) in repeat:
  75. continue
  76. repeat.add(("".join(beforeafter[0]),"".join(beforeafter[1]),"".join(beforeafter[2])))
  77. list_entity_id.append(entity_id)
  78. list_before.append("".join(beforeafter[0]))
  79. list_after.append("".join(beforeafter[2]))
  80. list_text.append("".join(beforeafter[1]))
  81. list_label.append(label)
  82. list_prob.append(values[label])
  83. print("len",len(list_entity_id))
  84. parts = 1
  85. parts_num = len(list_entity_id)//parts
  86. for i in range(parts-1):
  87. data = {"entity_id":list_entity_id[i*parts_num:(i+1)*parts_num],"list_before":list_before[i*parts_num:(i+1)*parts_num],"list_after":list_after[i*parts_num:(i+1)*parts_num],"list_text":list_text[i*parts_num:(i+1)*parts_num],"list_label":list_label[i*parts_num:(i+1)*parts_num],"list_prob":list_prob[i*parts_num:(i+1)*parts_num]}
  88. df = pd.DataFrame(data)
  89. df.to_excel("未标注错误_"+str(i)+".xls",columns=["entity_id","list_before","list_text","list_after","list_label","list_prob"])
  90. i = parts - 1
  91. data = {"entity_id":list_entity_id[i*parts_num:],"list_before":list_before[i*parts_num:],"list_after":list_after[i*parts_num:],"list_text":list_text[i*parts_num:],"list_label":list_label[i*parts_num:],"list_prob":list_prob[i*parts_num:]}
  92. df = pd.DataFrame(data)
  93. df.to_excel("测试数据_role1"+str(i)+".xls",columns=["entity_id","list_before","list_text","list_after","list_label","list_prob"])
  94. def importWrongDataOfRole():
  95. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  96. cursor = conn.cursor()
  97. parts = 2
  98. for i in range(parts):
  99. file = "wrong_role_"+str(i)+".xls"
  100. df = pd.read_excel(file)
  101. for entity_id,old_label,new_label in zip(df["entity_id"],df["list_label"],df["turn"]):
  102. '''
  103. if math.isnan(new_label):
  104. print(entity_id)
  105. '''
  106. sql = " insert into turn_label(entity_id,old_label,new_label) values('"+entity_id+"','"+str(int(old_label))+"','"+str(int(new_label))+"')"
  107. cursor.execute(sql)
  108. conn.commit()
  109. conn.close()
  110. def importTurnDataOfRole():
  111. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  112. cursor = conn.cursor()
  113. for file in glob.glob("done/*.xls"):
  114. if re.search("wrong",file) is not None:
  115. continue
  116. df = pd.read_excel(file)
  117. for entity_id,old_label,new_label in zip(df["entity_id"],df["list_label"],df["turn"]):
  118. ''''''
  119. if math.isnan(new_label):
  120. new_label = old_label
  121. print(entity_id)
  122. sql = " insert into turn_label(entity_id,old_label,new_label) values('"+entity_id+"','"+str(int(old_label))+"','"+str(int(new_label))+"')"
  123. cursor.execute(sql)
  124. conn.commit()
  125. conn.close()
  126. def selectWithRule(source,filter,target):
  127. assert source!=target
  128. dict_source = pd.read_excel(source)
  129. set_filter = set()
  130. for filt in filter:
  131. set_filter = set_filter | set(pd.read_excel(filt)["list_entityid"])
  132. list_entity_id = []
  133. list_before = []
  134. list_text = []
  135. list_after = []
  136. list_label = []
  137. selectdata = []
  138. for id,before,text,after,label in zip(dict_source["list_entityid"],dict_source["list_before"],dict_source["list_center"],dict_source["list_after"],dict_source["list_label"]):
  139. if id in set_filter:
  140. continue
  141. if re.search("",str(before)) is not None:
  142. selectdata.append([id,before,text,after,label])
  143. selectdata.sort(key=lambda x:x[4])
  144. for item in selectdata:
  145. list_entity_id.append(item[0])
  146. list_before.append(item[1])
  147. list_text.append(item[2])
  148. list_after.append(item[3])
  149. list_label.append(item[4])
  150. data = {"list_entityid":list_entity_id,"list_before":list_before,"list_text":list_text,"list_after":list_after,"list_label":list_label}
  151. columns = ["list_entityid","list_before","list_text","list_after","list_label"]
  152. df = pd.DataFrame(data)
  153. df.to_excel(target,index=False,columns=columns)
  154. def dumpData():
  155. files = []
  156. for file in glob.glob("C:\\Users\\User\\Desktop\\20190416要素\\*.html"):
  157. filename = file.split("\\")[-1]
  158. files.append(filename)
  159. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  160. cursor = conn.cursor()
  161. sql = " select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,C.new_label from sentences_selffool A,entity_mention_selffool B,turn_label_selffool C where B.entity_type in ('org','company') and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id "
  162. cursor.execute(sql)
  163. rows = cursor.fetchall()
  164. data = []
  165. for row in rows:
  166. if "_".join(row[0].split("_")[:-3]) not in files:
  167. data.append(row)
  168. save(data,"id_token_text_begin_end_label-selffool.pk1")
  169. conn.close()
def generateTrainData():
    # Run the full prediction pipeline over every processed article and
    # persist articles/sentences/entities (with predicted labels) to the DB.
    codeNamePredict = predictor.CodeNamePredict()
    premPredict = predictor.PREMPredict()
    epcPredict = predictor.EPCPredict()
    roleRulePredict = predictor.RoleRulePredictor()
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select id,content from articles_processed"
    cursor.execute(sql)
    data = cursor.fetchall()
    # _count doubles as a resume offset: bump it to skip already-done rows.
    _count = 0
    for row in data[_count:]:
        try:
            _count += 1
            print(_count, len(data))
            doc_id = row[0]
            text = row[1]
            # Pipeline: preprocess -> code/name -> PREM -> role rules -> EPC,
            # then persist everything for this document and commit.
            list_articles, list_sentences, list_entitys = Preprocessing.get_preprocessed([[doc_id, text]], useselffool=True)
            codeName = codeNamePredict.predict(list_articles)
            premPredict.predict(list_sentences, list_entitys)
            roleRulePredict.predict(list_sentences, list_entitys, codeName[0][1]["name"])
            epcPredict.predict(list_sentences, list_entitys)
            persistArticle(conn, list_articles, codeName)
            for sentences in list_sentences:
                persistSentence(conn, sentences)
            for entitys in list_entitys:
                persistEntity(conn, entitys)
            conn.commit()
        except Exception as e:
            # Best-effort batch job: log the failing document and keep going.
            print(doc_id, str(e))
    conn.close()
    # NOTE(review): leftover dead code — opens and immediately closes a second
    # connection with an unused cursor; appears to have no effect. Confirm and
    # remove.
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    conn.close()
def getDifferenctTrainData():
    # Build the set of context windows present in the previously dumped
    # training data, then export DB entity mentions that do NOT yet exist in
    # entity_mention (deduplicated by context window) to role_notexists.xls.
    files = ["id_token_text_begin_end_label.pk1"]
    train_set_before = set()
    for file in files:
        data = load(file)
        for row in data:
            # This spanWindow comes from the wildcard Utils import (it accepts
            # center_include/word_flag/text, unlike the local helper inside
            # getWrongData) — TODO confirm against Utils.
            _span = spanWindow(tokens=row[1], begin_index=row[3], end_index=row[4], size=10, center_include=True, word_flag=True, text=row[2])
            _label = row[5]
            item = (str(_span[0]), str(_span[1]), str(_span[2]), str(_label))
            train_set_before.add(item)
    # The block below is retired code that split rows into "same"/"notsame"
    # sets against train_set_before; kept for reference.
    '''
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,B.label from sentences_selffool A,entity_mention_selffool B,turn_label C where B.entity_type in ('org','company') and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id "
    cursor.execute(sql)
    rows = cursor.fetchall()
    list_same_entityid = []
    list_same_before = []
    list_same_center = []
    list_same_after = []
    list_same_label = []
    list_notsame_entityid = []
    list_notsame_before = []
    list_notsame_center = []
    list_notsame_after = []
    list_notsame_label = []
    train_set_now = set()
    _index = 0
    for row in rows:
        _span = spanWindow(tokens=row[1],begin_index=row[3],end_index=row[4],size=10,center_include=True,word_flag=True,text=row[2])
        _label = row[5]
        item = (str(_span[0]),str(_span[1]),str(_span[2]),str(_label))
        if item in train_set_before:
            list_same_entityid.append(row[0])
            list_same_before.append(_span[0])
            list_same_center.append(_span[1])
            list_same_after.append(_span[2])
            list_same_label.append(str(_label))
            if len(list_same_entityid)>65000:
                data_same = {"list_entityid":list_same_entityid,
                             "list_before":list_same_before,
                             "list_center":list_same_center,
                             "list_after":list_same_after,
                             "list_label":list_same_label}
                df = pd.DataFrame(data_same,columns=["list_entityid","list_before","list_center","list_after","list_label"])
                df.to_excel("role_same"+str(_index)+".xls")
                _index += 1
                list_same_entityid = []
                list_same_before = []
                list_same_center = []
                list_same_after = []
                list_same_label = []
        else:
            if item not in train_set_now:
                list_notsame_entityid.append(row[0])
                list_notsame_before.append(_span[0])
                list_notsame_center.append(_span[1])
                list_notsame_after.append(_span[2])
                list_notsame_label.append(str(_label))
                train_set_now.add(item)
    data_same = {"list_entityid":list_same_entityid,
                 "list_before":list_same_before,
                 "list_center":list_same_center,
                 "list_after":list_same_after,
                 "list_label":list_same_label}
    df = pd.DataFrame(data_same,columns=["list_entityid","list_before","list_center","list_after","list_label"])
    df.to_excel("role_same"+str(_index)+".xls")
    data_notsame = {"list_entityid":list_notsame_entityid,
                    "list_before":list_notsame_before,
                    "list_center":list_notsame_center,
                    "list_after":list_notsame_after,
                    "list_label":list_notsame_label}
    df = pd.DataFrame(data_notsame,columns=["list_entityid","list_before","list_center","list_after","list_label"])
    df.to_excel("role_notsame.xls")
    '''
    # Export entity mentions absent from entity_mention, one row per unique
    # (before, center, after, label) context window, sorted by label.
    _context_set = set()
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,B.label from sentences_selffool A,entity_mention_selffool B where B.entity_type in ('org','company') and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and not exists(select 1 from entity_mention where entity_mention.entity_id=B.entity_id) "
    cursor.execute(sql)
    rows = cursor.fetchall()
    list_notexists_entityid = []
    list_notexists_before = []
    list_notexists_center = []
    list_notexists_after = []
    list_notexists_label = []
    rows.sort(key=lambda x: x[5])
    for row in rows:
        _span = spanWindow(tokens=row[1], begin_index=row[3], end_index=row[4], size=10, center_include=True, word_flag=True, text=row[2])
        _label = row[5]
        item = (str(_span[0]), str(_span[1]), str(_span[2]), str(_label))
        if item not in _context_set:
            list_notexists_entityid.append(row[0])
            list_notexists_before.append(_span[0])
            list_notexists_center.append(_span[1])
            list_notexists_after.append(_span[2])
            list_notexists_label.append(str(_label))
            _context_set.add(item)
    data_notexists = {"list_entityid": list_notexists_entityid,
                      "list_before": list_notexists_before,
                      "list_center": list_notexists_center,
                      "list_after": list_notexists_after,
                      "list_label": list_notexists_label}
    df = pd.DataFrame(data_notexists, columns=["list_entityid", "list_before", "list_center", "list_after", "list_label"])
    df.to_excel("role_notexists.xls")
    # NOTE(review): the connection opened above is never closed here — confirm
    # whether that is intentional.
  309. def updateTurnLabel():
  310. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  311. cursor = conn.cursor()
  312. df = pd.read_excel("批量.xls")
  313. for entity_id,label in zip(df["list_entityid"],df["list_label"]):
  314. sql = " update turn_label_selffool set new_label="+str(int(label))+" where entity_id='"+str(entity_id)+"' "
  315. cursor.execute(sql)
  316. conn.commit()
  317. conn.close()
  318. def importTurnLabel():
  319. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  320. cursor = conn.cursor()
  321. df = pd.read_excel("批量notexists.xls")
  322. for entity_id,label in zip(df["list_entityid"],df["list_label"]):
  323. sql = " insert into turn_label_selffool(entity_id,new_label) values('"+entity_id+"',"+str(int(label))+")"
  324. cursor.execute(sql)
  325. conn.commit()
  326. conn.close()
  327. #print(train_set_before)
if __name__ == "__main__":
    # Ad-hoc maintenance entry point: uncomment whichever step is needed.
    #getWrongData()
    #importWrongDataOfRole()
    #selectWithRule("role_notexists.xls",["批量notexists.xls"],"rule.xls")
    #importTurnDataOfRole()
    #dumpData()
    #generateTrainData()
    #getDifferenctTrainData()
    #updateTurnLabel()
    #importTurnLabel()
    # Sanity check: report the sizes of the two pickled datasets
    # (load presumably comes from the wildcard Utils import — confirm).
    a = load("id_token_text_begin_end_label.pk1")
    print(len(a))
    b = load("id_token_text_begin_end_label-selffool.pk1")
    print(len(b))