# getLabelDatas.py (3.2 KB)
'''
Created on January 10, 2019
@author: User
'''
  5. import psycopg2
  6. import codecs
  7. import re
  8. import os
  9. import pandas as pd
  10. from BiddingKG.dl.common.Utils import *
  11. def getDatasToExcel():
  12. '''
  13. @summary: 将预标注的数据导出到excel中
  14. '''
  15. list_entity_id = []
  16. list_label = []
  17. list_before = []
  18. list_center = []
  19. list_after = []
  20. list_label_text = []
  21. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  22. cursor = conn.cursor()
  23. sql = " select A.entity_id,A.label,A.entity_text,A.begin_index,A.end_index,B.tokens,case when A.label=1 then '招标联系人' when A.label=2 then '代理联系人' when A.label=3 then '联系人' else '无' end as link from predict_entity_copy A,predict_sentences_copy B where A.entity_type='person' and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index order by A.label"
  24. cursor.execute(sql)
  25. rows = cursor.fetchall()
  26. for row in rows:
  27. tokens = row[5]
  28. begin_index = row[3]
  29. end_index = row[4]
  30. entity_text = row[2]
  31. label_text = row[6]
  32. list_entity_id.append(row[0])
  33. list_label.append(str(row[1]))
  34. beforeafter = spanWindow(tokens,begin_index,end_index,10)
  35. list_before.append(beforeafter[0])
  36. list_center.append(entity_text)
  37. list_after.append(beforeafter[1])
  38. list_label_text.append(label_text)
  39. columns = ["id","label","before","center","after","label_text"]
  40. nums = 3
  41. parts = len(list_entity_id)//nums
  42. print(parts)
  43. i = 0
  44. while(i<nums-1):
  45. pdframe = pd.DataFrame({"id":list_entity_id[i*parts:(i+1)*parts],"label":list_label[i*parts:(i+1)*parts],"before":list_before[i*parts:(i+1)*parts],"center":list_center[i*parts:(i+1)*parts],"after":list_after[i*parts:(i+1)*parts],"label_text":list_label_text[i*parts:(i+1)*parts]})
  46. pdframe.to_excel("person_"+str(i)+".xls",columns = columns)
  47. i += 1
  48. pdframe = pd.DataFrame({"id":list_entity_id[i*parts:],"label":list_label[i*parts:],"before":list_before[i*parts:],"center":list_center[i*parts:],"after":list_after[i*parts:],"label_text":list_label_text[i*parts:]})
  49. pdframe.to_excel("person_"+str(i)+".xls",columns = columns)
  50. def getDatasFromExcel():
  51. '''
  52. @summary: 将人工标注好的数据从excel中导入到数据库中
  53. '''
  54. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  55. cursor = conn.cursor()
  56. home = "./label/"
  57. files = os.listdir(home)
  58. for file in files:
  59. data = pd.read_excel(home+file)
  60. list_entity_id = data['id']
  61. list_label = data['label']
  62. list_relabel = data['relabel']
  63. for i in range(len(list_entity_id)):
  64. if str(list_relabel[i])!="nan":
  65. label = str(int(list_relabel[i]))
  66. else:
  67. label = str(int(list_label[i]))
  68. entity_id = list_entity_id[i]
  69. sql = " insert into hand_label_person(entity_id,label) values('"+str(entity_id)+"',"+label+")"
  70. cursor.execute(sql)
  71. conn.commit()
  72. conn.close()
  73. if __name__=="__main__":
  74. #getDatasToExcel()
  75. getDatasFromExcel()