123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384 |
- from BiddingKG.dl.common.Utils import *
- import psycopg2
- import pandas as pd
- import math
- import glob
- from Entity2DB import *
- import BiddingKG.dl.interface.predictor as predictor
- import BiddingKG.dl.interface.Preprocessing as Preprocessing
def getWrongData():
    """Export suspected-correct role labels to Excel for manual review.

    Pulls hand-labeled org/company entity mentions (not yet re-reviewed in
    turn_label) from Postgres, keeps only those belonging to documents in the
    local review folder whose model probability is >= 0.5, de-duplicates by
    context window, and writes the result to .xls files.
    """

    def spanWindow(tokens, begin_index, end_index, size):
        """Return [left context, entity tokens, right context] for a mention.

        tokens       -- tokenized sentence (list)
        begin_index  -- first token index of the entity
        end_index    -- last token index of the entity (inclusive)
        size         -- number of context tokens to take on each side
        """
        total = len(tokens)
        left = begin_index - size if begin_index > size else 0
        right = end_index + size + 1 if end_index + size < total else total
        return [
            tokens[left:begin_index],
            tokens[begin_index:end_index + 1],
            tokens[end_index + 1:right],
        ]

    # Only documents present in the local review folder are exported.
    files = [path.split("\\")[-1]
             for path in glob.glob("C:\\Users\\User\\Desktop\\20190416要素\\*.html")]

    conn = psycopg2.connect(dbname="article_label", user="postgres",
                            password="postgres", host="192.168.2.101")
    cursor = conn.cursor()

    sql = '''
    select A.entity_id,A.entity_text,A.begin_index,A.end_index,A.label,A.values,B.tokens,A.doc_id
    from entity_mention A,sentences B
    where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index
    and A.entity_type in ('org','company')
    and A.handlabel='1'
    and A.label!='None'
    and not exists(select 1 from turn_label where entity_id=A.entity_id)
    order by A.label
    '''
    cursor.execute(sql)
    rows = cursor.fetchall()

    list_entity_id = []
    list_before = []
    list_after = []
    list_text = []
    list_label = []
    list_prob = []
    seen_contexts = set()
    for row in rows:
        entity_id = row[0]
        begin_index = row[2]
        end_index = row[3]
        label = int(row[4])
        # `values` column is stored like "{p0,p1,...}"; strip braces and split.
        values = row[5][1:-1].split(",")
        tokens = row[6]
        doc_id = row[7]

        if doc_id not in files:
            continue
        # Skip low-confidence predictions.
        if float(values[label]) < 0.5:
            continue

        before, center, after = spanWindow(tokens, begin_index, end_index, 10)
        key = ("".join(before), "".join(center), "".join(after))
        # De-duplicate identical (before, entity, after) context windows.
        if key in seen_contexts:
            continue
        seen_contexts.add(key)

        list_entity_id.append(entity_id)
        list_before.append(key[0])
        list_after.append(key[2])
        list_text.append(key[1])
        list_label.append(label)
        list_prob.append(values[label])

    print("len", len(list_entity_id))
    # Split into `parts` spreadsheets; with parts == 1 only the tail file is written.
    parts = 1
    parts_num = len(list_entity_id) // parts
    columns = ["entity_id", "list_before", "list_text", "list_after", "list_label", "list_prob"]
    for i in range(parts - 1):
        chunk = slice(i * parts_num, (i + 1) * parts_num)
        data = {"entity_id": list_entity_id[chunk],
                "list_before": list_before[chunk],
                "list_after": list_after[chunk],
                "list_text": list_text[chunk],
                "list_label": list_label[chunk],
                "list_prob": list_prob[chunk]}
        pd.DataFrame(data).to_excel("未标注错误_" + str(i) + ".xls", columns=columns)
    i = parts - 1
    data = {"entity_id": list_entity_id[i * parts_num:],
            "list_before": list_before[i * parts_num:],
            "list_after": list_after[i * parts_num:],
            "list_text": list_text[i * parts_num:],
            "list_label": list_label[i * parts_num:],
            "list_prob": list_prob[i * parts_num:]}
    df = pd.DataFrame(data)
    df.to_excel("测试数据_role1" + str(i) + ".xls", columns=columns)
-
def importWrongDataOfRole():
    """Import manually corrected role labels from wrong_role_*.xls into turn_label.

    Reads entity_id / old label / corrected label columns from each spreadsheet
    and inserts one turn_label row per entity.
    """
    conn = psycopg2.connect(dbname="article_label", user="postgres",
                            password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    parts = 2
    for i in range(parts):
        file = "wrong_role_" + str(i) + ".xls"
        df = pd.read_excel(file)
        for entity_id, old_label, new_label in zip(df["entity_id"], df["list_label"], df["turn"]):
            # NOTE(review): a NaN in the "turn" column makes int(new_label) raise;
            # the original had a disabled isnan check here — confirm the sheets
            # are fully filled before running.
            # Parameterized query instead of string concatenation: avoids SQL
            # injection and breakage on quotes in entity_id.
            sql = "insert into turn_label(entity_id,old_label,new_label) values(%s,%s,%s)"
            cursor.execute(sql, (entity_id, str(int(old_label)), str(int(new_label))))

    conn.commit()
    conn.close()
-
def importTurnDataOfRole():
    """Import re-reviewed role labels from done/*.xls into turn_label.

    Skips files whose name contains "wrong" (handled by importWrongDataOfRole).
    A missing correction (NaN in the "turn" column) falls back to the old label.
    """
    conn = psycopg2.connect(dbname="article_label", user="postgres",
                            password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    for file in glob.glob("done/*.xls"):
        if re.search("wrong", file) is not None:
            continue
        df = pd.read_excel(file)
        for entity_id, old_label, new_label in zip(df["entity_id"], df["list_label"], df["turn"]):
            if math.isnan(new_label):
                # No correction entered: keep the original label, log the id.
                new_label = old_label
                print(entity_id)
            # Parameterized query instead of string concatenation: avoids SQL
            # injection and breakage on quotes in entity_id.
            sql = "insert into turn_label(entity_id,old_label,new_label) values(%s,%s,%s)"
            cursor.execute(sql, (entity_id, str(int(old_label)), str(int(new_label))))

    conn.commit()
    conn.close()
-
def selectWithRule(source, filter, target):
    """Select rows of *source* not present in any *filter* sheet and write *target*.

    source -- .xls path with list_entityid/list_before/list_center/list_after/list_label
    filter -- iterable of .xls paths whose list_entityid values are excluded
    target -- output .xls path (must differ from source)

    Rows are sorted by label before writing.
    NOTE(review): the parameter name `filter` shadows the builtin; kept for
    backward compatibility with keyword callers.
    """
    assert source != target
    dict_source = pd.read_excel(source)
    excluded_ids = set()
    for filt in filter:
        excluded_ids = excluded_ids | set(pd.read_excel(filt)["list_entityid"])

    # re.search("", ...) always matches — the empty pattern is a placeholder
    # meant to be edited into a real selection rule per run.
    selectdata = [
        [row_id, before, text, after, label]
        for row_id, before, text, after, label in zip(
            dict_source["list_entityid"], dict_source["list_before"],
            dict_source["list_center"], dict_source["list_after"],
            dict_source["list_label"])
        if row_id not in excluded_ids and re.search("", str(before)) is not None
    ]
    selectdata.sort(key=lambda item: item[4])

    data = {
        "list_entityid": [item[0] for item in selectdata],
        "list_before": [item[1] for item in selectdata],
        "list_text": [item[2] for item in selectdata],
        "list_after": [item[3] for item in selectdata],
        "list_label": [item[4] for item in selectdata],
    }
    columns = ["list_entityid", "list_before", "list_text", "list_after", "list_label"]
    pd.DataFrame(data).to_excel(target, index=False, columns=columns)
-
def dumpData():
    """Dump selffool-labeled entity rows to a pickle, excluding review-folder docs.

    Rows whose doc id (entity_id minus its last three '_' segments) appears in
    the local review folder are dropped; the rest are saved via Utils.save.
    """
    review_docs = [path.split("\\")[-1]
                   for path in glob.glob("C:\\Users\\User\\Desktop\\20190416要素\\*.html")]

    conn = psycopg2.connect(dbname="article_label", user="postgres",
                            password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,C.new_label from sentences_selffool A,entity_mention_selffool B,turn_label_selffool C where B.entity_type in ('org','company') and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id "
    cursor.execute(sql)

    # entity_id looks like "<doc_id>_x_y_z"; strip the trailing three segments
    # to recover the document id for the exclusion check.
    data = [row for row in cursor.fetchall()
            if "_".join(row[0].split("_")[:-3]) not in review_docs]
    save(data, "id_token_text_begin_end_label-selffool.pk1")
    conn.close()
-
def generateTrainData():
    """Run the full BiddingKG prediction pipeline over articles_processed and
    persist articles, sentences and entities back to Postgres.

    Best-effort batch job: each article is processed inside try/except so a
    single failure does not stop the run.
    """
    # Predictors come from BiddingKG.dl.interface.predictor (module-level import).
    codeNamePredict = predictor.CodeNamePredict()
    premPredict = predictor.PREMPredict()
    epcPredict = predictor.EPCPredict()
    roleRulePredict = predictor.RoleRulePredictor()
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select id,content from articles_processed"
    cursor.execute(sql)
    data = cursor.fetchall()
    # _count doubles as a manual resume offset: bump it to skip already-done rows.
    _count = 0
    for row in data[_count:]:
        try:
            _count += 1
            print(_count,len(data))
            doc_id = row[0]
            text = row[1]
            # Segment/tokenize with the self-trained ("selffool") toolchain.
            list_articles,list_sentences,list_entitys = Preprocessing.get_preprocessed([[doc_id,text]],useselffool=True)
            codeName = codeNamePredict.predict(list_articles)
            premPredict.predict(list_sentences,list_entitys)
            # Rule-based role fixup takes the predicted project name of the first article.
            roleRulePredict.predict(list_sentences, list_entitys,codeName[0][1]["name"])
            epcPredict.predict(list_sentences,list_entitys)
            persistArticle(conn, list_articles,codeName)
            for sentences in list_sentences:
                persistSentence(conn, sentences)
            for entitys in list_entitys:
                persistEntity(conn, entitys)
            # Commit per article so completed work survives a later failure.
            conn.commit()
        except Exception as e:
            # Log the failing document and continue with the next one.
            print(doc_id,str(e))
    conn.close()
    # NOTE(review): this trailing reconnect/close looks like leftover scaffolding
    # with no effect — confirm before removing.
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    conn.close()
-
def getDifferenctTrainData():
    """Export selffool entity mentions that do not exist in entity_mention.

    First builds the set of (before, center, after, label) context items from
    the pickled training data, then dumps not-yet-covered mentions (deduplicated
    by context) sorted by label to role_notexists.xls.  The large triple-quoted
    block in the middle is a disabled earlier variant that also split rows into
    "same"/"notsame" spreadsheets.
    """
    files = ["id_token_text_begin_end_label.pk1"]
    train_set_before = set()
    for file in files:
        # load() comes from BiddingKG.dl.common.Utils (star import).
        data = load(file)
        for row in data:
            # This is the Utils spanWindow (supports center_include/word_flag/text),
            # not the nested helper defined in getWrongData.
            _span = spanWindow(tokens=row[1],begin_index=row[3],end_index=row[4],size=10,center_include=True,word_flag=True,text=row[2])
            _label = row[5]
            item = (str(_span[0]),str(_span[1]),str(_span[2]),str(_label))
            train_set_before.add(item)

    '''
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,B.label from sentences_selffool A,entity_mention_selffool B,turn_label C where B.entity_type in ('org','company') and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id "
    cursor.execute(sql)
    rows = cursor.fetchall()

    list_same_entityid = []
    list_same_before = []
    list_same_center = []
    list_same_after = []
    list_same_label = []

    list_notsame_entityid = []
    list_notsame_before = []
    list_notsame_center = []
    list_notsame_after = []
    list_notsame_label = []

    train_set_now = set()
    _index = 0
    for row in rows:
        _span = spanWindow(tokens=row[1],begin_index=row[3],end_index=row[4],size=10,center_include=True,word_flag=True,text=row[2])
        _label = row[5]
        item = (str(_span[0]),str(_span[1]),str(_span[2]),str(_label))
        if item in train_set_before:
            list_same_entityid.append(row[0])
            list_same_before.append(_span[0])
            list_same_center.append(_span[1])
            list_same_after.append(_span[2])
            list_same_label.append(str(_label))
            if len(list_same_entityid)>65000:
                data_same = {"list_entityid":list_same_entityid,
                             "list_before":list_same_before,
                             "list_center":list_same_center,
                             "list_after":list_same_after,
                             "list_label":list_same_label}
                df = pd.DataFrame(data_same,columns=["list_entityid","list_before","list_center","list_after","list_label"])
                df.to_excel("role_same"+str(_index)+".xls")
                _index += 1
                list_same_entityid = []
                list_same_before = []
                list_same_center = []
                list_same_after = []
                list_same_label = []

        else:
            if item not in train_set_now:
                list_notsame_entityid.append(row[0])
                list_notsame_before.append(_span[0])
                list_notsame_center.append(_span[1])
                list_notsame_after.append(_span[2])
                list_notsame_label.append(str(_label))
                train_set_now.add(item)
    data_same = {"list_entityid":list_same_entityid,
                 "list_before":list_same_before,
                 "list_center":list_same_center,
                 "list_after":list_same_after,
                 "list_label":list_same_label}

    df = pd.DataFrame(data_same,columns=["list_entityid","list_before","list_center","list_after","list_label"])
    df.to_excel("role_same"+str(_index)+".xls")

    data_notsame = {"list_entityid":list_notsame_entityid,
                    "list_before":list_notsame_before,
                    "list_center":list_notsame_center,
                    "list_after":list_notsame_after,
                    "list_label":list_notsame_label}
    df = pd.DataFrame(data_notsame,columns=["list_entityid","list_before","list_center","list_after","list_label"])
    df.to_excel("role_notsame.xls")
    '''
    # Active path: dump selffool mentions missing from entity_mention,
    # deduplicated by context window.
    _context_set = set()
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,B.label from sentences_selffool A,entity_mention_selffool B where B.entity_type in ('org','company') and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and not exists(select 1 from entity_mention where entity_mention.entity_id=B.entity_id) "
    cursor.execute(sql)
    rows = cursor.fetchall()
    list_notexists_entityid = []
    list_notexists_before = []
    list_notexists_center = []
    list_notexists_after = []
    list_notexists_label = []
    # Sort by label so the output spreadsheet groups rows per class.
    rows.sort(key=lambda x:x[5])
    for row in rows:
        _span = spanWindow(tokens=row[1],begin_index=row[3],end_index=row[4],size=10,center_include=True,word_flag=True,text=row[2])
        _label = row[5]
        item = (str(_span[0]),str(_span[1]),str(_span[2]),str(_label))
        if item not in _context_set:
            list_notexists_entityid.append(row[0])
            list_notexists_before.append(_span[0])
            list_notexists_center.append(_span[1])
            list_notexists_after.append(_span[2])
            list_notexists_label.append(str(_label))
            _context_set.add(item)

    data_notexists = {"list_entityid":list_notexists_entityid,
                      "list_before":list_notexists_before,
                      "list_center":list_notexists_center,
                      "list_after":list_notexists_after,
                      "list_label":list_notexists_label}
    df = pd.DataFrame(data_notexists,columns=["list_entityid","list_before","list_center","list_after","list_label"])
    df.to_excel("role_notexists.xls")
-
def updateTurnLabel():
    """Bulk-update turn_label_selffool.new_label from the 批量.xls spreadsheet."""
    conn = psycopg2.connect(dbname="article_label", user="postgres",
                            password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    df = pd.read_excel("批量.xls")
    for entity_id, label in zip(df["list_entityid"], df["list_label"]):
        # Parameterized query instead of string concatenation: avoids SQL
        # injection and breakage on quotes in entity_id.
        sql = "update turn_label_selffool set new_label=%s where entity_id=%s"
        cursor.execute(sql, (int(label), str(entity_id)))
    conn.commit()
    conn.close()
-
def importTurnLabel():
    """Insert new turn_label_selffool rows from the 批量notexists.xls spreadsheet."""
    conn = psycopg2.connect(dbname="article_label", user="postgres",
                            password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    df = pd.read_excel("批量notexists.xls")
    for entity_id, label in zip(df["list_entityid"], df["list_label"]):
        # Parameterized query instead of string concatenation: avoids SQL
        # injection and breakage on quotes in entity_id.
        sql = "insert into turn_label_selffool(entity_id,new_label) values(%s,%s)"
        cursor.execute(sql, (entity_id, int(label)))
    conn.commit()
    conn.close()
-
- #print(train_set_before)
if __name__=="__main__":
    # Menu of one-off maintenance tasks: uncomment the step you want to run.
    #getWrongData()
    #importWrongDataOfRole()
    #selectWithRule("role_notexists.xls",["批量notexists.xls"],"rule.xls")
    #importTurnDataOfRole()
    #dumpData()
    #generateTrainData()
    #getDifferenctTrainData()
    #updateTurnLabel()
    #importTurnLabel()
    # Sanity check: print the sizes of the two pickled training dumps
    # (load() comes from BiddingKG.dl.common.Utils via the star import).
    a = load("id_token_text_begin_end_label.pk1")
    print(len(a))
    b = load("id_token_text_begin_end_label-selffool.pk1")
    print(len(b))
|