# getDatas.py
  1. import os
  2. import pandas as pd
  3. import pickle
  4. import psycopg2
  5. import codecs
  6. import re
  7. import fool
  8. def getHandLabelDatas():
  9. '''
  10. @summary:对使用jupyter标注的数据插入到数据库中
  11. '''
  12. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  13. cursor = conn.cursor()
  14. def load(path):
  15. '''
  16. 读取对象
  17. @Arugs:
  18. path: 读取的路径
  19. @Return:
  20. 读取的对象
  21. '''
  22. with open(path, 'rb') as f:
  23. object = pickle.load(f)
  24. return object
  25. for file in os.listdir(""):
  26. if file[-2:]=="pk":
  27. #if file[-9:-6] in ["l0.","l1.","l2.","l8."]:
  28. df = load("./"+file)
  29. for i in range(len(df)):
  30. if df.loc[i]['projectcode']!="" or df.loc[i]['projectname']!="":
  31. sql = " insert into project(doc_id,projectCode,projectName) values('"+df.loc[i]['doc_id']+"','"+df.loc[i]['projectcode']+"','"+df.loc[i]['projectname']+"')"
  32. #print(sql)
  33. print(file,i)
  34. cursor.execute(sql)
  35. conn.commit()
  36. conn.close()
  37. def getPredictCodeAndName():
  38. '''
  39. @summary:对模型的测试数据进行解析,判断模型的效果
  40. '''
  41. file = "predict_test.txt"
  42. with codecs.open(file,"r",encoding="utf8") as f:
  43. contents = f.read()
  44. data = [[row.split() for row in sample.split("\n")]for sample in contents.strip().split("\n\n")]
  45. '''
  46. with codecs.open("docid_content.txt","r",encoding="utf8") as f:
  47. docid_content = f.read()
  48. data_docid_content = [sample.split() for sample in docid_content.strip().split("\n")]
  49. print(len(data),len(data_docid_content))
  50. assert len(data)==len(data_docid_content)
  51. '''
  52. code_pattern = re.compile("PC_B,(PC_M,)+PC_E")
  53. name_pattern = re.compile("PN_B,(PN_M,)+PN_E")
  54. sum_label_code = 0
  55. sum_predict_code = 0
  56. sum_label_predict_code = 0
  57. sum_label_name = 0
  58. sum_predict_name = 0
  59. sum_label_predict_name = 0
  60. with codecs.open("projectcodename.html","w",encoding="utf8") as f:
  61. f.write('<html><head>\
  62. <meta http-equiv="Content-Type"\
  63. content="text/html; charset=UTF-8">\
  64. </head>\
  65. <body bgcolor="#FFFFFF">\
  66. <table border="1">\
  67. <tr>\
  68. <td>句子</td>\
  69. <td>标签编号</td>\
  70. <td>标签名称</td>\
  71. <td>预测标号</td>\
  72. <td>预测名称<</td>\
  73. </tr>')
  74. for i in range(len(data)):
  75. a,b,c = zip(*data[i])
  76. text = "".join(a)
  77. label = ",".join(b)
  78. predict = ",".join(c)
  79. label_code = []
  80. label_name = []
  81. predict_code = []
  82. predict_name = []
  83. for match in re.finditer(code_pattern,label):
  84. (match_begin,match_end) = match.span()
  85. text_begin = len(re.split(",",label[:match_begin]))-1
  86. text_length = len(re.split(",",label[match_begin:match_end]))
  87. label_code.append(text[text_begin:text_begin+text_length])
  88. for match in re.finditer(name_pattern,label):
  89. (match_begin,match_end) = match.span()
  90. text_begin = len(re.split(",",label[:match_begin]))-1
  91. text_length = len(re.split(",",label[match_begin:match_end]))
  92. label_name.append(text[text_begin:text_begin+text_length])
  93. for match in re.finditer(code_pattern,predict):
  94. (match_begin,match_end) = match.span()
  95. text_begin = len(re.split(",",predict[:match_begin]))-1
  96. text_length = len(re.split(",",predict[match_begin:match_end]))
  97. predict_code.append(text[text_begin:text_begin+text_length])
  98. for match in re.finditer(name_pattern,predict):
  99. (match_begin,match_end) = match.span()
  100. text_begin = len(re.split(",",predict[:match_begin]))-1
  101. text_length = len(re.split(",",predict[match_begin:match_end]))
  102. predict_name.append(text[text_begin:text_begin+text_length])
  103. if len(label_code)>0:
  104. sum_label_code += 1
  105. if len(predict_code)>0:
  106. sum_predict_code += 1
  107. if len(set(label_code)&set(predict_code))>0:
  108. sum_label_predict_code += 1
  109. if len(label_name)>0:
  110. sum_label_name += 1
  111. if len(predict_name)>0:
  112. sum_predict_name += 1
  113. if len(set(label_name)&set(predict_name))>0:
  114. sum_label_predict_name += 1
  115. #f.write("<td>"+str(docid_content[i][0])+"</td>"+"<td>"+str(docid_content[i][1])+"</td>"+"<td>"+str(";".join(label_code))+"</td>"+"<td>"+str(";".join(label_name))+"</td>"+"<td>"+str(";".join(predict_code))+"</td>"+"<td>"+str(";".join(predict_name))+"</td>")
  116. f.write("<td>"+text+"</td>"+"<td>"+str(";".join(label_code))+"</td>"+"<td>"+str(";".join(label_name))+"</td>"+"<td>"+str(";".join(predict_code))+"</td>"+"<td>"+str(";".join(predict_name))+"</td>")
  117. f.write("</tr>")
  118. f.write("\n")
  119. f.write('</tr>\
  120. </table>\
  121. </body>\
  122. </html>')
  123. print("sum_label_code:%d,sum_predict_code:%d,sum_label_predict_code:%s"%(sum_label_code,sum_predict_code,sum_label_predict_code))
  124. print("sum_label_name:%d,sum_predict_name:%d,sum_label_predict_name:%s"%(sum_label_name,sum_predict_name,sum_label_predict_name))
  125. f.flush()
  126. f.close()
  127. def relabelHandlabels():
  128. '''
  129. @summary:对标注的项目名称,若前面含有实体,则加入到项目名称中,使用fool在发现实体
  130. '''
  131. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  132. cursor = conn.cursor()
  133. sql = " select A.content,B.projectname,B.doc_id from articles_processed A,project B where A.id=B.doc_id "
  134. cursor.execute(sql)
  135. def findAllIndex(substr,wholestr):
  136. copystr = wholestr
  137. result = []
  138. indexappend = 0
  139. while(True):
  140. index = copystr.find(substr)
  141. if index<0:
  142. break
  143. else:
  144. result.append(indexappend+index)
  145. indexappend += index+len(substr)
  146. copystr = copystr[index+len(substr):]
  147. return result
  148. rows = cursor.fetchall()
  149. updateData = []
  150. row_index = 0
  151. for row in rows:
  152. print(len(rows),row_index)
  153. row_index += 1
  154. doc_id = row[2]
  155. name = row[1] if row[1]!="" else ""
  156. names = re.split("[;;]",name)
  157. contents = re.split("。",str(row[0]))
  158. first_len_name = len(names)
  159. for content in contents:
  160. for name in names:
  161. if len(name)==0:
  162. continue
  163. all_begin_index = findAllIndex(name,content)
  164. for begin_index in all_begin_index:
  165. if begin_index<30:
  166. test_text = content[:begin_index]
  167. else:
  168. test_text = content[begin_index-30:begin_index]
  169. entitys = fool.ner(test_text)[0]
  170. for entity in entitys:
  171. if len(entity)==0:
  172. continue
  173. if int(entity[1])==len(test_text)+1 and entity[2] in ["org","company"]:
  174. if entity[3]+name not in names:
  175. names.append(entity[3]+name)
  176. if len(names)>first_len_name:
  177. data_item = [doc_id,";".join(names)]
  178. updateData.append(data_item)
  179. print("lenUpdatedata:",len(updateData))
  180. for item in updateData:
  181. sql = " insert into relabelproject(doc_id,names) values('"+item[0]+"','"+item[1]+"')"
  182. cursor.execute(sql)
  183. conn.commit()
  184. conn.close()
  185. if __name__=="__main__":
  186. #getHandLabelDatas()
  187. getPredictCodeAndName()
  188. #relabelHandlabels()