'''
Created on 2019-01-10
@author: User
'''
- import psycopg2
- import codecs
- import re
- import os
- import pandas as pd
- from BiddingKG.dl.common.Utils import *
-
def getDatasToExcel(nums=3):
    '''
    @summary: Export the pre-labelled "person" entities to Excel files.

    Reads every person entity joined with its sentence tokens from the
    predict_entity_copy / predict_sentences_copy tables, builds one row per
    entity (id, label, text before, entity text, text after, label_text) and
    writes the rows, in query order, to the files person_0.xls ...
    person_<nums-1>.xls in the current working directory.

    @param nums: number of Excel files to split the rows into. The first
        nums-1 files each hold len(rows)//nums rows; the last file holds
        everything that is left. Defaults to 3.
    @raise ValueError: if nums is smaller than 1.
    '''
    if nums < 1:
        raise ValueError("nums must be a positive integer")

    columns = ["id","label","before","center","after","label_text"]
    sql = " select A.entity_id,A.label,A.entity_text,A.begin_index,A.end_index,B.tokens,case when A.label=1 then '招标联系人' when A.label=2 then '代理联系人' when A.label=3 then '联系人' else '无' end as link from predict_entity_copy A,predict_sentences_copy B where A.entity_type='person' and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index order by A.label"

    # fetchall() materialises the whole result set, so the cursor and the
    # connection can be released before any further processing happens.
    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()

    records = []
    for entity_id, label, entity_text, begin_index, end_index, tokens, label_text in rows:
        # spanWindow comes from BiddingKG.dl.common.Utils; element 0 of its
        # result is used as the "before" text and element 1 as the "after"
        # text. Presumably a 10-token context window - confirm in Utils.
        beforeafter = spanWindow(tokens, begin_index, end_index, 10)
        records.append((entity_id, str(label), beforeafter[0], entity_text, beforeafter[1], label_text))

    parts = len(records)//nums

    print(parts)

    # NOTE(review): writing the legacy .xls format depends on the Excel
    # engine available to the installed pandas version - confirm it is still
    # supported before upgrading pandas.
    for i in range(nums):
        start = i*parts
        # The last file takes the remainder left over by the integer division.
        stop = (i+1)*parts if i < nums-1 else None
        # A fresh frame per chunk keeps the written index starting at 0.
        pdframe = pd.DataFrame(records[start:stop], columns=columns)
        pdframe.to_excel("person_"+str(i)+".xls",columns = columns)
-
def getDatasFromExcel():
    '''
    @summary: Import the manually reviewed labels from Excel into the database.

    Every file found in ./label/ is read with pandas and must provide the
    columns 'id', 'label' and 'relabel'. For each row the value of 'relabel'
    is used when it is present, otherwise the original 'label' is kept, and
    one row (entity_id, label) is inserted into hand_label_person.

    All inserts run in a single transaction that is committed once every
    file has been processed; if anything fails the connection is closed
    without committing, so no partial import is left behind.
    '''
    home = "./label/"
    # Parameterized statement: the driver quotes the values, so spreadsheet
    # content can never be interpreted as SQL.
    sql = "insert into hand_label_person(entity_id,label) values(%s,%s)"

    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        for file in os.listdir(home):
            data = pd.read_excel(os.path.join(home, file))
            list_entity_id = data['id']
            list_label = data['label']
            list_relabel = data['relabel']
            for entity_id, label, relabel in zip(list_entity_id, list_label, list_relabel):
                # An empty 'relabel' cell is read as a missing value (NaN).
                chosen = label if pd.isna(relabel) else relabel
                # int() yields a plain Python int, which the driver can
                # adapt; numpy scalar types are not guaranteed to be.
                cursor.execute(sql, (str(entity_id), int(chosen)))
        conn.commit()
    finally:
        # Closing without a prior commit discards the open transaction.
        conn.close()
-
if __name__ == "__main__":
    # Script entry point: only the import step is run. The export step is
    # kept below, disabled, for manual use.
    # getDatasToExcel()
    getDatasFromExcel()
|