money_labeling.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. import psycopg2
  2. import codecs
  3. import xlwt
  4. import re
  5. import os
  6. import xlrd
  7. import pandas as pd
  8. def getData(t="final_label_money"):
  9. conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
  10. cursor = conn.cursor()
  11. select_sql = " select A.doc_id,C.entity_id,C.label,case when C.label=0 then '招标人' when C.label=1 then '招标代理' when C.label=2 then '中标人/第一候选' when C.label=3 then '第二' when C.label=4 then '第三' else '无' end as 再标注,case when D.label=0 then '招标人' when D.label=1 then '招标代理' when D.label=2 then '中标人/第一候选' when D.label=3 then '第二' when D.label=4 then '第三' else '无' end as 原标注,B.entity_text,A.tokens[B.begin_index-10:B.begin_index],A.tokens[B.begin_index+1:B.end_index+1],A.tokens[B.end_index+2:B.end_index+12] "
  12. group_sql = " group by A.doc_id,C.entity_id,C.label,D.label,B.entity_text,B.begin_index,B.end_index,A.tokens,A.sentence_index "
  13. sql = select_sql+" from sentences A,entity_mention B,"+t+" C,label_guest_role D where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id and C.entity_id=D.entity_id and C.label!=D.label "+group_sql+"order by A.doc_id,A.sentence_index asc,D.label asc"
  14. cursor.execute(sql)
  15. result = []
  16. rows = cursor.fetchall()
  17. for row in rows:
  18. item = []
  19. for column in row:
  20. item.append(column)
  21. result.append(item)
  22. conn.close()
  23. return result
  24. def labeling(datas=getData()):
  25. sum = 0
  26. row_index = 0
  27. begin_doc_id = str(input("开始文章是:"))
  28. begin_index = 0
  29. end_index = len(datas)-1
  30. find_flag = False
  31. while(row_index<len(datas)):
  32. row = datas[row_index]
  33. if begin_doc_id!="" and begin_index==0:
  34. if row[0]==begin_doc_id:
  35. begin_index = row_index
  36. else:
  37. row_index += 1
  38. continue
  39. find_flag = True
  40. print(row[0])
  41. print(row[3],row[4],row[5])
  42. print("before",row[6])
  43. print("entity",row[7])
  44. print("after",row[8])
  45. while(True):
  46. l = str(input("标签为:"))
  47. if l in ["0","1","2","3","4","5","","8","9"]:
  48. break
  49. if l=="0":
  50. row[2] = 0
  51. elif l=="1":
  52. row[2] = 1
  53. elif l=="2":
  54. row[2] = 2
  55. elif l=="3":
  56. row[2] = 3
  57. elif l=="4":
  58. row[2] = 4
  59. elif l=="5":
  60. row[2] = 5
  61. elif l=="":
  62. pass
  63. elif l=="8":
  64. row_index -= 1
  65. sum -= 1
  66. continue
  67. elif l=="9":
  68. end_index = row_index-1
  69. break
  70. sum += 1
  71. row_index += 1
  72. print("sum:",sum)
  73. if find_flag:
  74. with codecs.open("relabel.txt","a",encoding="utf8") as f:
  75. for row in datas[begin_index:end_index+1]:
  76. f.write(str(row[1]))
  77. f.write("\t")
  78. f.write(str(row[2]))
  79. f.write("\n")
  80. f.flush()
  81. f.close()
  82. #设置表格样式
  83. def set_style(name,height,bold=False):
  84. style = xlwt.XFStyle()
  85. font = xlwt.Font()
  86. font.name = name
  87. font.bold = bold
  88. font.color_index = 4
  89. font.height = height
  90. style.font = font
  91. return style
  92. #将数据保存到excel中进行标注
  93. def getDatasToExcel():
  94. moneys = ["0_预算金额","1_中投标","2_无(非预算及中投标金额)"]
  95. conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
  96. cursor = conn.cursor()
  97. nums = 3
  98. for money in moneys:
  99. select_sql = " select C.entity_id,C.label,A.tokens[B.begin_index-10:B.begin_index],A.tokens[B.begin_index+1:B.end_index+1],A.tokens[B.end_index+2:B.end_index+12],case when C.label=0 then '预算金额' when C.label=1 then '中/投标' else '无(非预算及中投标金额)' end as 再标注 "
  100. sql = select_sql+" from sentences A,entity_mention_copy B,hand_label_money C where B.entity_type='money' and C.label="+money.split("_")[0]+" and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id order by C.label asc,A.doc_id,A.sentence_index asc limit 7000"
  101. pattern_tenderee = re.compile("报价上限|限价|造价|控制(总?价|金额)|预算|概算|(?:造?价|投资|规模)预?估?算|预?估算?(?:造?价|投资|规模|金额)|(?:总|项目|计划)(?:[估预概]算|投资)|(?:投资|采购)(?:单价|总)?(?:额|金额)|投资约")
  102. pattern_wintenderer = re.compile("[\((]?(?:中标|成交|评标|评审|投标|报价|合同|入围)[\),)]?(候选人|单位|人|候选单位|供应商|候选)?后?[单总]?(?:价|金额|价格|报价|标价)|报价|第[一二三](中标|投标|候选|名|成交)|(公司\s*[::]?$)")
  103. print(sql)
  104. cursor.execute(sql)
  105. rows = cursor.fetchall()
  106. parts = len(rows)//3
  107. for nums_i in range(nums):
  108. file = xlwt.Workbook()
  109. sheet = file.add_sheet("标注"+money,cell_overwrite_ok=True)
  110. row_head = ["entity_id","标注id","实体前","实体","实体后","角色","找茬-错误标0(正确留空)"]
  111. row_index = 0
  112. style = set_style('Times New Roman',220,True)
  113. for i in range(len(row_head)):
  114. sheet.write(row_index,i,row_head[i],style)
  115. row_index += 1
  116. if nums_i<nums-1:
  117. for row in rows[nums_i*parts:(nums_i+1)*parts]:
  118. for i in range(len(row)):
  119. sheet.write(row_index,i,row[i],style)
  120. row_index += 1
  121. else:
  122. for row in rows[nums_i*parts:]:
  123. for i in range(len(row)):
  124. sheet.write(row_index,i,row[i],style)
  125. row_index += 1
  126. file.save("标注"+money.split("_")[1]+str(nums_i)+".xls")
  127. conn.close()
  128. def getDatasToExcel_centainity():
  129. moneys = ["0_预算金额","1_中投标","2_无(非预算及中投标金额)"]
  130. conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
  131. cursor = conn.cursor()
  132. nums = 3
  133. for money in moneys:
  134. select_sql = " select C.entity_id,C.label,A.tokens[B.begin_index-10:B.begin_index],A.tokens[B.begin_index+1:B.end_index+1],A.tokens[B.end_index+2:B.end_index+12],case when C.label=0 then '预算金额' when C.label=1 then '中/投标' else '无(非预算及中投标金额)' end as 再标注 "
  135. sql = select_sql+" from sentences A,entity_mention B,final_label_money C where B.entity_type='money' and C.label="+money.split("_")[0]+" and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id order by C.label asc,A.doc_id,A.sentence_index asc limit 7000"
  136. print(sql)
  137. cursor.execute(sql)
  138. rows = cursor.fetchall()
  139. parts = len(rows)//3
  140. for nums_i in range(nums):
  141. file = xlwt.Workbook()
  142. sheet = file.add_sheet("标注"+money,cell_overwrite_ok=True)
  143. row_head = ["entity_id","标注id","实体前","实体","实体后","角色","找茬-错误标0(正确留空)"]
  144. row_index = 0
  145. style = set_style('Times New Roman',220,True)
  146. for i in range(len(row_head)):
  147. sheet.write(row_index,i,row_head[i],style)
  148. row_index += 1
  149. if nums_i<nums-1:
  150. for row in rows[nums_i*parts:(nums_i+1)*parts]:
  151. for i in range(len(row)):
  152. sheet.write(row_index,i,row[i],style)
  153. row_index += 1
  154. else:
  155. for row in rows[nums_i*parts:]:
  156. for i in range(len(row)):
  157. sheet.write(row_index,i,row[i],style)
  158. row_index += 1
  159. file.save("标注"+money.split("_")[1]+str(nums_i)+".xls")
  160. conn.close()
  161. #从excel中获取标注数据
  162. def getDatasFromExcel():
  163. home = "./label_done/"
  164. col_entity_id = 0
  165. col_label = 1
  166. col_flag = 6
  167. table = "hand_label_money"
  168. conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
  169. cursor = conn.cursor()
  170. cursor.execute(" select to_regclass('"+table+"') is null ")
  171. notExists = cursor.fetchall()[0][0]
  172. '''
  173. if notExists:
  174. cursor.execute(" create table "+table+" (entity_id text,label int)")
  175. else:
  176. cursor.execute(" delete from "+table)
  177. conn.commit()
  178. '''
  179. for file in os.listdir(home):
  180. if os.path.isfile(home+file):
  181. book = xlrd.open_workbook(home+file)
  182. sheet = book.sheet_by_index(0)
  183. for row_index in range(1,sheet.nrows):
  184. print(row_index,file)
  185. if len(re.sub('\s*','',str(sheet.cell_value(row_index,col_flag))))==0 or (int(sheet.cell_value(row_index,col_flag))>0):
  186. #sql = " insert into "+table+"(entity_id,label) values('"+str(sheet.cell_value(row_index,col_entity_id))+"',"+str(int(sheet.cell_value(row_index,col_label)))+")"
  187. #cursor.execute(sql)
  188. print(str(sheet.cell_value(row_index,col_entity_id))+"',"+str(int(sheet.cell_value(row_index,col_label))))
  189. conn.commit()
  190. conn.close()
  191. def getHandLabelData():
  192. def spanWindow(tokens,begin_index,end_index,size):
  193. '''
  194. @summary:取得某个实体的上下文词汇
  195. @param:
  196. tokens:句子分词list
  197. begin_index:实体的开始index
  198. end_index:实体的结束index
  199. size:左右两边各取多少个词
  200. @return: list,实体的上下文词汇
  201. '''
  202. length_tokens = len(tokens)
  203. if begin_index>size:
  204. begin = begin_index-size
  205. else:
  206. begin = 0
  207. if end_index+size<length_tokens:
  208. end = end_index+size+1
  209. else:
  210. end = length_tokens
  211. result = []
  212. result.append(tokens[begin:begin_index])
  213. result.append(tokens[begin_index:end_index+1])
  214. result.append(tokens[end_index+1:end])
  215. #print(result)
  216. return result
  217. conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
  218. cursor = conn.cursor()
  219. sql = '''
  220. select A.entity_id,A.entity_text,A.begin_index,A.end_index,C.label,B.tokens
  221. from entity_mention_copy A,sentences B,hand_label_money C
  222. where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index
  223. and A.entity_type in ('money')
  224. and A.entity_id=C.entity_id
  225. order by C.label
  226. '''
  227. cursor.execute(sql)
  228. rows = cursor.fetchall()
  229. list_entity_id = []
  230. list_before = []
  231. list_after = []
  232. list_text = []
  233. list_label = []
  234. repeat = set()
  235. for row in rows:
  236. entity_id = row[0]
  237. #entity_text = row[1]
  238. begin_index = row[2]
  239. end_index = row[3]
  240. label = int(row[4])
  241. tokens = row[5]
  242. beforeafter = spanWindow(tokens, begin_index, end_index, 10)
  243. if ("".join(beforeafter[0]),"".join(beforeafter[1]),"".join(beforeafter[2])) in repeat:
  244. continue
  245. repeat.add(("".join(beforeafter[0]),"".join(beforeafter[1]),"".join(beforeafter[2])))
  246. list_entity_id.append(entity_id)
  247. list_before.append("".join(beforeafter[0]))
  248. list_after.append("".join(beforeafter[2]))
  249. list_text.append("".join(beforeafter[1]))
  250. list_label.append(label)
  251. print("len",len(list_entity_id))
  252. parts = 1
  253. parts_num = len(list_entity_id)//parts
  254. for i in range(parts-1):
  255. data = {"entity_id":list_entity_id[i*parts_num:(i+1)*parts_num],"list_before":list_before[i*parts_num:(i+1)*parts_num],"list_after":list_after[i*parts_num:(i+1)*parts_num],"list_text":list_text[i*parts_num:(i+1)*parts_num],"list_label":list_label[i*parts_num:(i+1)*parts_num]}
  256. df = pd.DataFrame(data)
  257. df.to_excel("未标注错误_"+str(i)+".xls",columns=["entity_id","list_before","list_text","list_after","list_label","list_prob"])
  258. i = parts - 1
  259. data = {"entity_id":list_entity_id[i*parts_num:],"list_before":list_before[i*parts_num:],"list_after":list_after[i*parts_num:],"list_text":list_text[i*parts_num:],"list_label":list_label[i*parts_num:]}
  260. df = pd.DataFrame(data)
  261. df.to_excel("wrong_money_"+str(i)+".xls",columns=["entity_id","list_before","list_text","list_after","list_label","list_prob"])
  262. if __name__=="__main__":
  263. #labeling()
  264. #getDatasToExcel()
  265. #getDatasFromExcel()
  266. getHandLabelData()