123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422 |
- import psycopg2
- import codecs
- import xlwt
- import xlrd
- import os
- import re
- from xlutils.copy import copy
- from BiddingKG.dl.common.Utils import *
- import pandas as pd
- import math
- def getData(t="final_label_role"):
- '''
-
- '''
- conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- select_sql = " select A.doc_id,C.entity_id,C.label,case when C.label=0 then '招标人' when C.label=1 then '招标代理' when C.label=2 then '中标人/第一候选' when C.label=3 then '第二' when C.label=4 then '第三' else '无' end as 再标注,case when D.label=0 then '招标人' when D.label=1 then '招标代理' when D.label=2 then '中标人/第一候选' when D.label=3 then '第二' when D.label=4 then '第三' else '无' end as 原标注,B.entity_text,A.tokens[B.begin_index-10:B.begin_index],A.tokens[B.begin_index+1:B.end_index+1],A.tokens[B.end_index+2:B.end_index+12] "
-
- group_sql = " group by A.doc_id,C.entity_id,C.label,D.label,B.entity_text,B.begin_index,B.end_index,A.tokens,A.sentence_index "
-
- sql = select_sql+" from sentences A,entity_mention B,"+t+" C,label_guest_role D where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id and C.entity_id=D.entity_id and C.label!=D.label "+group_sql+"order by A.doc_id,A.sentence_index asc,D.label asc"
- cursor.execute(sql)
-
- result = []
- rows = cursor.fetchall()
- for row in rows:
- item = []
- for column in row:
- item.append(column)
- result.append(item)
- conn.close()
- return result
- def labeling(datas):
- '''
- @summary:标注数据
- @param:
- datas:待标注数据,包括doc_id,entity_id,标注值,上下文
- '''
- sum = 0
- row_index = 0
- begin_doc_id = str(input("开始文章是:"))
- begin_index = 0
- end_index = len(datas)-1
- find_flag = False
- while(row_index<len(datas)):
- row = datas[row_index]
- if begin_doc_id!="" and begin_index==0:
- if row[0]==begin_doc_id:
- begin_index = row_index
- else:
- row_index += 1
- continue
- find_flag = True
- print(row[0])
- print(row[3],row[4],row[5])
- print("before",row[6])
- print("entity",row[7])
- print("after",row[8])
- while(True):
- l = str(input("标签为:"))
- if l in ["0","1","2","3","4","5","","8","9"]:
- break
- if l=="0":
- row[2] = 0
- elif l=="1":
- row[2] = 1
- elif l=="2":
- row[2] = 2
- elif l=="3":
- row[2] = 3
- elif l=="4":
- row[2] = 4
- elif l=="5":
- row[2] = 5
- elif l=="":
- pass
- elif l=="8":
- row_index -= 1
- sum -= 1
- continue
- elif l=="9":
- end_index = row_index-1
- break
-
- sum += 1
- row_index += 1
- print("sum:",sum)
- if find_flag:
- with codecs.open("relabel.txt","a",encoding="utf8") as f:
- for row in datas[begin_index:end_index+1]:
- f.write(str(row[1]))
- f.write("\t")
- f.write(str(row[2]))
- f.write("\n")
- f.flush()
- f.close()
- #设置表格样式
- def set_style(name,height,bold=False):
- style = xlwt.XFStyle()
- font = xlwt.Font()
- font.name = name
- font.bold = bold
- font.color_index = 4
- font.height = height
- style.font = font
- return style
- def getDatasToExcel():
- '''
- @summary:取出待标注数据到excel中
- '''
- roles = ["0_招标人","1_招标代理","2_中标第一候选","3_第二候选","4_第三候选","5_无"]
- conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- nums = 3
-
- for role in roles:
-
-
- select_sql = " select C.entity_id,C.label,A.tokens[B.begin_index-10:B.begin_index],A.tokens[B.begin_index+1:B.end_index+1],A.tokens[B.end_index+2:B.end_index+12],case when C.label=0 then '招标人' when C.label=1 then '招标代理' when C.label=2 then '中标人/第一候选' when C.label=3 then '第二候选' when C.label=4 then '第三候选' else '无' end as 再标注 "
-
- sql = select_sql+" from sentences A,entity_mention B,final_label_role C where C.label="+role.split("_")[0]+" and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id and not exists(select 1 from relabel where C.entity_id=relabel.entity_id) order by C.label asc,A.doc_id,A.sentence_index asc limit 20000"
-
- print(sql)
- cursor.execute(sql)
-
- rows = cursor.fetchall()
-
- parts = len(rows)//3
-
- for nums_i in range(nums):
- file = xlwt.Workbook()
- sheet = file.add_sheet("标注"+role,cell_overwrite_ok=True)
- row_head = ["entity_id","标注id","实体前","实体","实体后","角色","正确?(1-正确,0-错误)"]
- row_index = 0
- style = set_style('Times New Roman',220,True)
- for i in range(len(row_head)):
- sheet.write(row_index,i,row_head[i],style)
-
- row_index += 1
- if nums_i<nums-1:
-
- for row in rows[nums_i*parts:(nums_i+1)*parts]:
- for i in range(len(row)):
- sheet.write(row_index,i,row[i],style)
- row_index += 1
-
- else:
- for row in rows[nums_i*parts:]:
- for i in range(len(row)):
- sheet.write(row_index,i,row[i],style)
- row_index += 1
-
- file.save("标注"+role.split("_")[1]+str(nums_i)+".xls")
- conn.close()
-
- def getDatasFromExcel():
- '''
- @summary:从已经标注的excel中取出标注数据
- '''
- home = "./label/role_done/"
- col_entity_id = 0
- col_label = 1
- col_flag = 6
- table = "hand_label_role"
- conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- cursor.execute(" select to_regclass('"+table+"') is null ")
- notExists = cursor.fetchall()[0][0]
- if notExists:
- cursor.execute(" create table "+table+" (entity_id text,label int)")
- else:
- cursor.execute(" delete from "+table)
- conn.commit()
-
- for file in os.listdir(home):
- if os.path.isfile(home+file):
- book = xlrd.open_workbook(home+file)
- sheet = book.sheet_by_index(0)
- for row_index in range(1,sheet.nrows):
- print(row_index,file)
- if len(str(sheet.cell_value(row_index,col_flag)))>0 and (int(sheet.cell_value(row_index,col_flag))>0):
- sql = " insert into "+table+"(entity_id,label) values('"+str(sheet.cell_value(row_index,col_entity_id))+"',"+str(int(sheet.cell_value(row_index,col_label)))+")"
- cursor.execute(sql)
- conn.commit()
- conn.close()
-
- def selectWrongDatasFromExcel():
- '''
- @summary:取出标注为错误的数据
- '''
- home = "./label/role_done/"
- data = []
- toExcel_file = "../../dl_dev/role/label/role_done/候选中因序号标错到无.xls"
- toExcel = xlwt.Workbook()
- toExcel_sheet = toExcel.add_sheet("错误标注到无",cell_overwrite_ok=True)
- row_head = ["entity_id","标注id","实体前","实体","实体后","角色","正确?(1-正确,0-错误)"]
- row_index_toExcel = 0
- style = set_style('Times New Roman',220,True)
- for i in range(len(row_head)):
- toExcel_sheet.write(row_index_toExcel,i,row_head[i],style)
-
- row_index_toExcel += 1
-
- for file in os.listdir(home):
- if os.path.isfile(home+file):
- if re.search(re.compile("第[一二三]"),file) is not None:
- book = xlrd.open_workbook(home+file)
- sheet = book.sheet_by_index(0)
-
- changeBook = copy(book)
- changeSheet = changeBook.get_sheet(0)
- for row_index in range(0,sheet.nrows):
- if re.search(re.compile("排名|排序|名次|第[123一二三]|(中标|成交)(人|单位|供应商)|成交情况"),str(sheet.cell_value(row_index,2))) is None:
- if re.search(re.compile("序号[::][123]"),str(sheet.cell_value(row_index,4))) is not None:
- print(file,sheet.row_values(row_index))
- changeSheet.write(row_index,6,0)
- row = sheet.row_values(row_index)
- row[1] = 5
- row[5] = "无"
- row[6] = 1
- for i in range(len(row)):
- toExcel_sheet.write(row_index_toExcel,i,row[i],style)
- row_index_toExcel += 1
- changeBook.save(home+"".join(file.split(".")[:-1])+"修改序号.xls")
- toExcel.save(toExcel_file)
-
- def exportHandLabelData():
- conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- sql = '''
- select A.entity_id,A.entity_text,A.begin_index,A.end_index,C.label,B.tokens
- from entity_mention A,sentences B ,hand_label_role C
- where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and A.entity_id=C.entity_id
- and A.entity_type in ('org','company')
- order by C.label
- '''
-
- cursor.execute(sql)
-
- rows = cursor.fetchall()
-
- list_entity_id = []
- list_before = []
- list_after = []
- list_text = []
- list_label = []
- repeat = set()
- for row in rows:
- entity_id = row[0]
- entity_text = row[1]
- begin_index = row[2]
- end_index = row[3]
- label = int(row[4])
- tokens = row[5]
-
- beforeafter = spanWindow(tokens, begin_index, end_index, 10)
-
- if ("".join(beforeafter[0]),entity_text,"".join(beforeafter[1])) in repeat:
- continue
-
- if str(label)!="5":
- continue
-
- repeat.add(("".join(beforeafter[0]),entity_text,"".join(beforeafter[1])))
-
- list_entity_id.append(entity_id)
- list_before.append("".join(beforeafter[0]))
- list_after.append("".join(beforeafter[1]))
- list_text.append(entity_text)
- list_label.append(label)
- print("len",len(list_entity_id))
- parts = 1
- parts_num = len(list_entity_id)//parts
- for i in range(parts-1):
-
- data = {"entity_id":list_entity_id[i*parts_num:(i+1)*parts_num],"list_before":list_before[i*parts_num:(i+1)*parts_num],"list_after":list_after[i*parts_num:(i+1)*parts_num],"list_text":list_text[i*parts_num:(i+1)*parts_num],"list_label":list_label[i*parts_num:(i+1)*parts_num]}
- df = pd.DataFrame(data)
- df.to_excel("原先标注数据_role_"+str(i)+".xls",columns=["entity_id","list_before","list_text","list_after","list_label"])
- i = parts - 1
- data = {"entity_id":list_entity_id[i*parts_num:],"list_before":list_before[i*parts_num:],"list_after":list_after[i*parts_num:],"list_text":list_text[i*parts_num:],"list_label":list_label[i*parts_num:]}
- df = pd.DataFrame(data)
- df.to_excel("角色无数据_role_"+str(i)+".xls",columns=["entity_id","list_before","list_text","list_after","list_label"])
-
- def selectWithRule(source,filter,target):
- assert target not in filter
- assert source!=target
- dict_source = pd.read_excel(source)
- set_filter = set()
- for filt in filter:
- set_filter = set_filter | set(pd.read_excel(filt)["entity_id"])
-
- list_entity_id = []
- list_before = []
- list_text = []
- list_after = []
- list_label = []
-
- for id,before,text,after,label in zip(dict_source["entity_id"],dict_source["list_before"],dict_source["list_text"],dict_source["list_after"],dict_source["list_label"]):
- if id in set_filter:
- continue
- if re.search("",str(before)) is not None:
- list_entity_id.append(id)
- list_before.append(before)
- list_text.append(text)
- list_after.append(after)
- list_label.append(label)
-
- data = {"entity_id":list_entity_id,"list_before":list_before,"list_text":list_text,"list_after":list_after,"list_label":list_label}
- columns = ["entity_id","list_before","list_text","list_after","list_label"]
-
- df = pd.DataFrame(data)
- df.to_excel(target,index=False,columns=columns)
-
- def importreHandLabelData():
- conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- table = "hand_label_role_0409"
- files = ["待检测原先标注数据_role_11.xls","待检查原先标注数据_role_00.xls","批量.xls"]
- for file in files:
- df = pd.read_excel(file)
- for entity_id,label,turn in zip(df["entity_id"],df["list_label"],df["turn"]):
- new_label = label
- #print(entity_id)
- if not math.isnan(turn):
- new_label = turn
- sql = " insert into "+table+"(entity_id,label) values('"+entity_id+"',"+str(new_label)+") "
- cursor.execute(sql)
- conn.commit()
- conn.close()
-
- def dumpData():
- conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- sql = " select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,C.label from sentences A,entity_mention_copy B,hand_label_role_0409 C where B.entity_type in ('org','company') and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id "
- cursor.execute(sql)
-
- rows = cursor.fetchall()
-
- save(rows,"id_token_text_begin_end_label.pk")
-
- conn.close()
-
- def relabel():
- pkfiles = ["id_token_text_begin_end_label.pk","id_token_text_begin_end_label.pk1"]
-
- list_id = []
- list_before = []
- list_text = []
- list_after = []
- list_label = []
- for file in pkfiles:
- for row in load(file):
- id = row[0]
- token = row[1]
- text = row[2]
- begin = int(row[3])
- end = int(row[4])
- label = int(row[5])
- span = spanWindow(token, begin, end, size=10, center_include=True, word_flag=True)
- before = span[0]
- center = span[1]
- after = span[2]
- if re.search("中标人.{,3}$",before) is not None:
- list_id.append(id)
- list_before.append(before)
- list_text.append(center)
- list_after.append(after)
- list_label.append(label)
- df = pd.DataFrame({"list_id":list_id,"list_before":list_before,"list_text":list_text,"list_after":list_after,"list_label":list_label})
- df.to_excel("rule1.xls",columns=["list_id","list_before","list_text","list_after","list_label"],index=False)
-
- def importAfterrelabel():
- conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
-
- conn_1 = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
- cursor_1 = conn_1.cursor()
- df = pd.read_excel("rule1.xls")
- list_id = df["list_id"]
- list_label = df["list_label"]
- count = 0
- for id,label in zip(list_id,list_label):
- if re.search("比地",id) is not None:
- sql = " update turn_label set new_label='"+str(int(label))+"' where entity_id='"+id+"' "
- cursor_1.execute(sql)
- else:
- sql = " update hand_label_role_0409 set label="+str(int(label))+" where entity_id='"+id+"' "
- cursor.execute(sql)
- count += 1
- print("done",count)
- conn.commit()
- conn_1.commit()
-
- conn.close()
- conn_1.close()
-
-
-
- if __name__=="__main__":
- pass
- #labeling()
- #getDatasToExcel()
- #getDatasFromExcel()
- #selectWrongDatasFromExcel()
- #exportHandLabelData()
- #selectWithRule("角色无数据_role_0.xls",["批量.xls"],"公告公示.xls")
- #importreHandLabelData()
- #dumpData()
- #relabel()
- #importAfterrelabel()
|