import psycopg2 import codecs import xlwt import xlrd import os import re from xlutils.copy import copy from BiddingKG.dl.common.Utils import * import pandas as pd import math def getData(t="final_label_role"): ''' ''' conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() select_sql = " select A.doc_id,C.entity_id,C.label,case when C.label=0 then '招标人' when C.label=1 then '招标代理' when C.label=2 then '中标人/第一候选' when C.label=3 then '第二' when C.label=4 then '第三' else '无' end as 再标注,case when D.label=0 then '招标人' when D.label=1 then '招标代理' when D.label=2 then '中标人/第一候选' when D.label=3 then '第二' when D.label=4 then '第三' else '无' end as 原标注,B.entity_text,A.tokens[B.begin_index-10:B.begin_index],A.tokens[B.begin_index+1:B.end_index+1],A.tokens[B.end_index+2:B.end_index+12] " group_sql = " group by A.doc_id,C.entity_id,C.label,D.label,B.entity_text,B.begin_index,B.end_index,A.tokens,A.sentence_index " sql = select_sql+" from sentences A,entity_mention B,"+t+" C,label_guest_role D where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id and C.entity_id=D.entity_id and C.label!=D.label "+group_sql+"order by A.doc_id,A.sentence_index asc,D.label asc" cursor.execute(sql) result = [] rows = cursor.fetchall() for row in rows: item = [] for column in row: item.append(column) result.append(item) conn.close() return result def labeling(datas): ''' @summary:标注数据 @param: datas:待标注数据,包括doc_id,entity_id,标注值,上下文 ''' sum = 0 row_index = 0 begin_doc_id = str(input("开始文章是:")) begin_index = 0 end_index = len(datas)-1 find_flag = False while(row_index0 and (int(sheet.cell_value(row_index,col_flag))>0): sql = " insert into "+table+"(entity_id,label) values('"+str(sheet.cell_value(row_index,col_entity_id))+"',"+str(int(sheet.cell_value(row_index,col_label)))+")" cursor.execute(sql) conn.commit() conn.close() def selectWrongDatasFromExcel(): ''' @summary:取出标注为错误的数据 ''' home = "./label/role_done/" data = [] toExcel_file = "../../dl_dev/role/label/role_done/候选中因序号标错到无.xls" toExcel = xlwt.Workbook() toExcel_sheet = toExcel.add_sheet("错误标注到无",cell_overwrite_ok=True) row_head = ["entity_id","标注id","实体前","实体","实体后","角色","正确?(1-正确,0-错误)"] row_index_toExcel = 0 style = set_style('Times New Roman',220,True) for i in range(len(row_head)): toExcel_sheet.write(row_index_toExcel,i,row_head[i],style) row_index_toExcel += 1 for file in os.listdir(home): if os.path.isfile(home+file): if re.search(re.compile("第[一二三]"),file) is not None: book = xlrd.open_workbook(home+file) sheet = book.sheet_by_index(0) changeBook = copy(book) changeSheet = changeBook.get_sheet(0) for row_index in range(0,sheet.nrows): if re.search(re.compile("排名|排序|名次|第[123一二三]|(中标|成交)(人|单位|供应商)|成交情况"),str(sheet.cell_value(row_index,2))) is None: if re.search(re.compile("序号[::][123]"),str(sheet.cell_value(row_index,4))) is not None: print(file,sheet.row_values(row_index)) changeSheet.write(row_index,6,0) row = sheet.row_values(row_index) row[1] = 5 row[5] = "无" row[6] = 1 for i in range(len(row)): toExcel_sheet.write(row_index_toExcel,i,row[i],style) row_index_toExcel += 1 changeBook.save(home+"".join(file.split(".")[:-1])+"修改序号.xls") toExcel.save(toExcel_file) def exportHandLabelData(): conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() sql = ''' select A.entity_id,A.entity_text,A.begin_index,A.end_index,C.label,B.tokens from entity_mention A,sentences B ,hand_label_role C where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and A.entity_id=C.entity_id and A.entity_type in ('org','company') order by C.label ''' cursor.execute(sql) rows = cursor.fetchall() list_entity_id = [] list_before = [] list_after = [] list_text = [] list_label = [] repeat = set() for row in rows: entity_id = row[0] entity_text = row[1] begin_index = row[2] end_index = row[3] label = int(row[4]) tokens = row[5] beforeafter = spanWindow(tokens, begin_index, end_index, 10) if ("".join(beforeafter[0]),entity_text,"".join(beforeafter[1])) in repeat: continue if str(label)!="5": continue repeat.add(("".join(beforeafter[0]),entity_text,"".join(beforeafter[1]))) list_entity_id.append(entity_id) list_before.append("".join(beforeafter[0])) list_after.append("".join(beforeafter[1])) list_text.append(entity_text) list_label.append(label) print("len",len(list_entity_id)) parts = 1 parts_num = len(list_entity_id)//parts for i in range(parts-1): data = {"entity_id":list_entity_id[i*parts_num:(i+1)*parts_num],"list_before":list_before[i*parts_num:(i+1)*parts_num],"list_after":list_after[i*parts_num:(i+1)*parts_num],"list_text":list_text[i*parts_num:(i+1)*parts_num],"list_label":list_label[i*parts_num:(i+1)*parts_num]} df = pd.DataFrame(data) df.to_excel("原先标注数据_role_"+str(i)+".xls",columns=["entity_id","list_before","list_text","list_after","list_label"]) i = parts - 1 data = {"entity_id":list_entity_id[i*parts_num:],"list_before":list_before[i*parts_num:],"list_after":list_after[i*parts_num:],"list_text":list_text[i*parts_num:],"list_label":list_label[i*parts_num:]} df = pd.DataFrame(data) df.to_excel("角色无数据_role_"+str(i)+".xls",columns=["entity_id","list_before","list_text","list_after","list_label"]) def selectWithRule(source,filter,target): assert target not in filter assert source!=target dict_source = pd.read_excel(source) set_filter = set() for filt in filter: set_filter = set_filter | set(pd.read_excel(filt)["entity_id"]) list_entity_id = [] list_before = [] list_text = [] list_after = [] list_label = [] for id,before,text,after,label in zip(dict_source["entity_id"],dict_source["list_before"],dict_source["list_text"],dict_source["list_after"],dict_source["list_label"]): if id in set_filter: continue if re.search("",str(before)) is not None: list_entity_id.append(id) list_before.append(before) list_text.append(text) list_after.append(after) list_label.append(label) data = {"entity_id":list_entity_id,"list_before":list_before,"list_text":list_text,"list_after":list_after,"list_label":list_label} columns = ["entity_id","list_before","list_text","list_after","list_label"] df = pd.DataFrame(data) df.to_excel(target,index=False,columns=columns) def importreHandLabelData(): conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() table = "hand_label_role_0409" files = ["待检测原先标注数据_role_11.xls","待检查原先标注数据_role_00.xls","批量.xls"] for file in files: df = pd.read_excel(file) for entity_id,label,turn in zip(df["entity_id"],df["list_label"],df["turn"]): new_label = label #print(entity_id) if not math.isnan(turn): new_label = turn sql = " insert into "+table+"(entity_id,label) values('"+entity_id+"',"+str(new_label)+") " cursor.execute(sql) conn.commit() conn.close() def dumpData(): conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() sql = " select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,C.label from sentences A,entity_mention_copy B,hand_label_role_0409 C where B.entity_type in ('org','company') and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id " cursor.execute(sql) rows = cursor.fetchall() save(rows,"id_token_text_begin_end_label.pk") conn.close() def relabel(): pkfiles = ["id_token_text_begin_end_label.pk","id_token_text_begin_end_label.pk1"] list_id = [] list_before = [] list_text = [] list_after = [] list_label = [] for file in pkfiles: for row in load(file): id = row[0] token = row[1] text = row[2] begin = int(row[3]) end = int(row[4]) label = int(row[5]) span = spanWindow(token, begin, end, size=10, center_include=True, word_flag=True) before = span[0] center = span[1] after = span[2] if re.search("中标人.{,3}$",before) is not None: list_id.append(id) list_before.append(before) list_text.append(center) list_after.append(after) list_label.append(label) df = pd.DataFrame({"list_id":list_id,"list_before":list_before,"list_text":list_text,"list_after":list_after,"list_label":list_label}) df.to_excel("rule1.xls",columns=["list_id","list_before","list_text","list_after","list_label"],index=False) def importAfterrelabel(): conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() conn_1 = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101") cursor_1 = conn_1.cursor() df = pd.read_excel("rule1.xls") list_id = df["list_id"] list_label = df["list_label"] count = 0 for id,label in zip(list_id,list_label): if re.search("比地",id) is not None: sql = " update turn_label set new_label='"+str(int(label))+"' where entity_id='"+id+"' " cursor_1.execute(sql) else: sql = " update hand_label_role_0409 set label="+str(int(label))+" where entity_id='"+id+"' " cursor.execute(sql) count += 1 print("done",count) conn.commit() conn_1.commit() conn.close() conn_1.close() if __name__=="__main__": pass #labeling() #getDatasToExcel() #getDatasFromExcel() #selectWrongDatasFromExcel() #exportHandLabelData() #selectWithRule("角色无数据_role_0.xls",["批量.xls"],"公告公示.xls") #importreHandLabelData() #dumpData() #relabel() #importAfterrelabel()