from BiddingKG.dl.common.Utils import *
import psycopg2
import pandas as pd
import math
import glob
from Entity2DB import *
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing


# Column names (and order) of the spreadsheet written by getWrongData and
# read back by updateTurnLabel / importTurnLabel.
_SHEET_COLUMNS = ["list_entityid", "list_before", "list_center", "list_after", "list_label"]


def _connect():
    """Open a new psycopg2 connection to the ``article_label`` database."""
    # NOTE(review): host and credentials are hard-coded, exactly as in the
    # original script; consider moving them to configuration.
    return psycopg2.connect(dbname="article_label", user="postgres", password="postgres", host="192.168.2.101")


def _execute_per_label_row(excel_path, sql):
    """Run ``sql`` once for every (entity_id, label) row of a spreadsheet.

    @param:
        excel_path: spreadsheet with ``list_entityid`` and ``list_label`` columns
        sql: statement using the ``%(entity_id)s`` and ``%(label)s`` placeholders
    @return: None

    The entity id is bound as ``str`` and the label as ``int``; a label that
    ``int()`` cannot convert raises. Each row is committed individually, so
    rows processed before a failure stay in the database.
    """
    conn = _connect()
    try:
        cursor = conn.cursor()
        df = pd.read_excel(excel_path)
        for entity_id, label in zip(df["list_entityid"], df["list_label"]):
            # Bound parameters instead of string concatenation: values coming
            # from the spreadsheet can no longer alter the SQL text.
            cursor.execute(sql, {"entity_id": str(entity_id), "label": int(label)})
            conn.commit()
    finally:
        # Close the connection even when reading the file or a statement fails.
        conn.close()


def getWrongData():
    """Export org/company mentions that have no matching ``entity_mention`` row.

    Selects rows of ``entity_mention_selffool`` (entity_type 'org' or
    'company') joined to ``sentences_selffool`` on doc_id and sentence_index,
    keeping only those whose entity_id does not exist in ``entity_mention``.
    Rows are sorted by label, a context window is built for each with
    ``spanWindow`` (size 10), duplicates of the stringified
    (before, center, after, label) tuple are skipped, and the remainder is
    written to ``role_notexists.xls``.

    @return: None
    """
    # NOTE(review): the reviewed source also declared a nested 4-argument
    # spanWindow(tokens, begin_index, end_index, size) here, whose body was
    # truncated, followed by a truncated block of commented-out export code.
    # That nested signature cannot accept the center_include / word_flag /
    # text keywords used below, so the call is left to resolve to the
    # module-level spanWindow -- presumably supplied by the star import from
    # BiddingKG.dl.common.Utils; TODO confirm.
    sql = (" select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,B.label"
           " from sentences_selffool A,entity_mention_selffool B"
           " where B.entity_type in ('org','company')"
           " and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index"
           " and not exists(select 1 from entity_mention where entity_mention.entity_id=B.entity_id) ")

    conn = _connect()
    try:
        cursor = conn.cursor()
        cursor.execute(sql)
        rows = cursor.fetchall()
    finally:
        # The original never closed this connection.
        conn.close()

    # Stable sort by label, so that the first occurrence kept by the
    # de-duplication below follows the original row order within a label.
    rows.sort(key=lambda row: row[5])

    seen_contexts = set()
    sheet = {column: [] for column in _SHEET_COLUMNS}
    for row in rows:
        span = spanWindow(tokens=row[1], begin_index=row[3], end_index=row[4], size=10,
                          center_include=True, word_flag=True, text=row[2])
        label = row[5]
        context_key = (str(span[0]), str(span[1]), str(span[2]), str(label))
        if context_key in seen_contexts:
            continue
        seen_contexts.add(context_key)
        sheet["list_entityid"].append(row[0])
        sheet["list_before"].append(span[0])
        sheet["list_center"].append(span[1])
        sheet["list_after"].append(span[2])
        sheet["list_label"].append(str(label))

    df = pd.DataFrame(sheet, columns=_SHEET_COLUMNS)
    df.to_excel("role_notexists.xls")


def updateTurnLabel():
    """Update ``turn_label_selffool.new_label`` from the rows of ``批量.xls``.

    For every (list_entityid, list_label) pair in the spreadsheet, sets
    new_label to the integer label on the row with that entity_id.

    @return: None
    """
    _execute_per_label_row(
        "批量.xls",
        " update turn_label_selffool set new_label=%(label)s where entity_id=%(entity_id)s ")


def importTurnLabel():
    """Insert the rows of ``批量notexists.xls`` into ``turn_label_selffool``.

    For every (list_entityid, list_label) pair in the spreadsheet, inserts
    one (entity_id, new_label) row. The entity id is converted with ``str``
    so that non-string ids no longer raise ``TypeError``.

    @return: None
    """
    _execute_per_label_row(
        "批量notexists.xls",
        " insert into turn_label_selffool(entity_id,new_label) values(%(entity_id)s,%(label)s)")


if __name__=="__main__":
    # Pipeline steps kept from the original script; uncomment the one to run.
    #getWrongData()
    #importWrongDataOfRole()
    #selectWithRule("role_notexists.xls",["批量notexists.xls"],"rule.xls")
    #importTurnDataOfRole()
    #dumpData()
    #generateTrainData()
    #getDifferenctTrainData()
    #updateTurnLabel()
    #importTurnLabel()
    # Print the number of records in each dump (``load`` is not defined in
    # this file; it comes from one of the star imports).
    a = load("id_token_text_begin_end_label.pk1")
    print(len(a))
    b = load("id_token_text_begin_end_label-selffool.pk1")
    print(len(b))