import psycopg2
import codecs
import xlwt
import re
import os
import xlrd
import pandas as pd


def getData(t="final_label_money"):
    '''
    Fetch every entity whose re-annotated label (table t, alias C) disagrees with the
    original annotation in label_guest_role (alias D), together with the entity text
    and the tokens around it.
    Role labels: 0 = 招标人 (tenderer), 1 = 招标代理 (tendering agent),
    2 = 中标人/第一候选 (winning bidder / first candidate), 3 = 第二 (second candidate),
    4 = 第三 (third candidate), otherwise 无 (none); the column aliases 再标注 / 原标注
    are the re-annotated and the original label respectively.
    '''
    conn = psycopg2.connect(dbname="BiddingKM_test_10000", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    select_sql = " select A.doc_id,C.entity_id,C.label,case when C.label=0 then '招标人' when C.label=1 then '招标代理' when C.label=2 then '中标人/第一候选' when C.label=3 then '第二' when C.label=4 then '第三' else '无' end as 再标注,case when D.label=0 then '招标人' when D.label=1 then '招标代理' when D.label=2 then '中标人/第一候选' when D.label=3 then '第二' when D.label=4 then '第三' else '无' end as 原标注,B.entity_text,A.tokens[B.begin_index-10:B.begin_index],A.tokens[B.begin_index+1:B.end_index+1],A.tokens[B.end_index+2:B.end_index+12] "
    group_sql = " group by A.doc_id,C.entity_id,C.label,D.label,B.entity_text,B.begin_index,B.end_index,A.tokens,A.sentence_index "
    sql = select_sql+" from sentences A,entity_mention B,"+t+" C,label_guest_role D where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id and C.entity_id=D.entity_id and C.label!=D.label "+group_sql+"order by A.doc_id,A.sentence_index asc,D.label asc"
    cursor.execute(sql)
    result = []
    rows = cursor.fetchall()
    for row in rows:
        item = []
        for column in row:
            item.append(column)
        result.append(item)
    conn.close()
    return result


def labeling(datas=getData()):
    # Interactive pass over the mismatched entities returned by getData(), starting
    # from the document id typed by the user (prompt "开始文章是:" = "start from document:").
    sum = 0
    row_index = 0
    begin_doc_id = str(input("开始文章是:"))
    begin_index = 0
    end_index = len(datas)-1
    find_flag = False
    while(row_index<=end_index):  # loop condition reconstructed; the original condition and body are truncated in the source
        ...


# Tail of a truncated routine that writes hand-labelled (entity_id, label) rows from an
# Excel sheet back into a database table; its definition and setup are lost, so the
# surviving lines are kept here as comments (see the sketch at the end of the file):
#
#             #sql = " insert into "+table+"(entity_id,label) values('"+str(sheet.cell_value(row_index,col_entity_id))+"',"+str(int(sheet.cell_value(row_index,col_label)))+")"
#             #cursor.execute(sql)
#             print(str(sheet.cell_value(row_index,col_entity_id))+"',"+str(int(sheet.cell_value(row_index,col_label))))
#     conn.commit()
#     conn.close()


def getHandLabelData():

    def spanWindow(tokens, begin_index, end_index, size):
        '''
        @summary: get the context words around an entity
        @param:
            tokens: the tokenised sentence (list)
            begin_index: start index of the entity
            end_index: end index of the entity
            size: how many words to take on each side
        @return: list, the context words of the entity
        '''
        length_tokens = len(tokens)
        if begin_index>size:
            begin = begin_index-size
        else:
            begin = 0
        if end_index+size<length_tokens:  # end-of-window clamp mirrors the begin clamp above; the source cuts off after "if end_index+size"
            end = end_index+size
        else:
            end = length_tokens
        # the rest of spanWindow, and of getHandLabelData, is truncated in the source
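
# -----------------------------------------------------------------------------
# Sketch, not part of the original source: a possible wrapper around the
# surviving Excel-import tail kept as comments above. The lost routine presumably
# opened an .xls workbook with xlrd and pushed each hand-labelled
# (entity_id, label) pair into a database table. The function name
# importHandLabel, its parameters, the default table name, the column indices
# and the non-empty-cell guard are all hypothetical; only the cell access, the
# commented-out insert statement and the commit/close calls come from the
# surviving code, and the connection parameters are reused from getData().
def importHandLabel(xls_path, table="hand_label_role", col_entity_id=1, col_label=2):
    conn = psycopg2.connect(dbname="BiddingKM_test_10000", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    sheet = xlrd.open_workbook(xls_path).sheet_by_index(0)
    for row_index in range(1, sheet.nrows):  # assume the first row is a header
        if len(str(sheet.cell_value(row_index, col_entity_id))) > 0:
            #sql = " insert into "+table+"(entity_id,label) values('"+str(sheet.cell_value(row_index,col_entity_id))+"',"+str(int(sheet.cell_value(row_index,col_label)))+")"
            #cursor.execute(sql)
            print(str(sheet.cell_value(row_index,col_entity_id))+"',"+str(int(sheet.cell_value(row_index,col_label))))
    conn.commit()
    conn.close()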