# getWrongData.py
from BiddingKG.dl.common.Utils import *
import psycopg2
import pandas as pd
import math
import glob
import re
from Entity2DB import *
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
  9. def getWrongData():
  10. def spanWindow(tokens,begin_index,end_index,size):
  11. '''
  12. @summary:取得某个实体的上下文词汇
  13. @param:
  14. tokens:句子分词list
  15. begin_index:实体的开始index
  16. end_index:实体的结束index
  17. size:左右两边各取多少个词
  18. @return: list,实体的上下文词汇
  19. '''
  20. length_tokens = len(tokens)
  21. if begin_index>size:
  22. begin = begin_index-size
  23. else:
  24. begin = 0
  25. if end_index+size<length_tokens:
  26. end = end_index+size+1
  27. else:
  28. end = length_tokens
  29. result = []
  30. result.append(tokens[begin:begin_index])
  31. result.append(tokens[begin_index:end_index+1])
  32. result.append(tokens[end_index+1:end])
  33. #print(result)
  34. return result
  35. files = []
  36. for file in glob.glob("C:\\Users\\User\\Desktop\\20190416要素\\*.html"):
  37. filename = file.split("\\")[-1]
  38. files.append(filename)
  39. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  40. cursor = conn.cursor()
  41. sql = '''
  42. select A.entity_id,A.entity_text,A.begin_index,A.end_index,A.label,A.values,B.tokens,A.doc_id
  43. from entity_mention A,sentences B
  44. where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index
  45. and A.entity_type in ('org','company')
  46. and A.handlabel='1'
  47. and A.label!='None'
  48. and not exists(select 1 from turn_label where entity_id=A.entity_id)
  49. order by A.label
  50. '''
  51. cursor.execute(sql)
  52. rows = cursor.fetchall()
  53. list_entity_id = []
  54. list_before = []
  55. list_after = []
  56. list_text = []
  57. list_label = []
  58. list_prob = []
  59. repeat = set()
  60. for row in rows:
  61. entity_id = row[0]
  62. #entity_text = row[1]
  63. begin_index = row[2]
  64. end_index = row[3]
  65. label = int(row[4])
  66. values = row[5][1:-1].split(",")
  67. tokens = row[6]
  68. doc_id = row[7]
  69. if doc_id not in files:
  70. continue
  71. if float(values[label])<0.5:
  72. continue
  73. beforeafter = spanWindow(tokens, begin_index, end_index, 10)
  74. if ("".join(beforeafter[0]),"".join(beforeafter[1]),"".join(beforeafter[2])) in repeat:
  75. continue
  76. repeat.add(("".join(beforeafter[0]),"".join(beforeafter[1]),"".join(beforeafter[2])))
  77. list_entity_id.append(entity_id)
  78. list_before.append("".join(beforeafter[0]))
  79. list_after.append("".join(beforeafter[2]))
  80. list_text.append("".join(beforeafter[1]))
  81. list_label.append(label)
  82. list_prob.append(values[label])
  83. print("len",len(list_entity_id))
  84. parts = 1
  85. parts_num = len(list_entity_id)//parts
  86. for i in range(parts-1):
  87. data = {"entity_id":list_entity_id[i*parts_num:(i+1)*parts_num],"list_before":list_before[i*parts_num:(i+1)*parts_num],"list_after":list_after[i*parts_num:(i+1)*parts_num],"list_text":list_text[i*parts_num:(i+1)*parts_num],"list_label":list_label[i*parts_num:(i+1)*parts_num],"list_prob":list_prob[i*parts_num:(i+1)*parts_num]}
  88. df = pd.DataFrame(data)
  89. df.to_excel("未标注错误_"+str(i)+".xls",columns=["entity_id","list_before","list_text","list_after","list_label","list_prob"])
  90. i = parts - 1
  91. data = {"entity_id":list_entity_id[i*parts_num:],"list_before":list_before[i*parts_num:],"list_after":list_after[i*parts_num:],"list_text":list_text[i*parts_num:],"list_label":list_label[i*parts_num:],"list_prob":list_prob[i*parts_num:]}
  92. df = pd.DataFrame(data)
  93. df.to_excel("测试数据_role1"+str(i)+".xls",columns=["entity_id","list_before","list_text","list_after","list_label","list_prob"])
  94. def importWrongDataOfRole():
  95. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  96. cursor = conn.cursor()
  97. parts = 2
  98. for i in range(parts):
  99. file = "wrong_role_"+str(i)+".xls"
  100. df = pd.read_excel(file)
  101. for entity_id,old_label,new_label in zip(df["entity_id"],df["list_label"],df["turn"]):
  102. '''
  103. if math.isnan(new_label):
  104. print(entity_id)
  105. '''
  106. sql = " insert into turn_label(entity_id,old_label,new_label) values('"+entity_id+"','"+str(int(old_label))+"','"+str(int(new_label))+"')"
  107. cursor.execute(sql)
  108. conn.commit()
  109. conn.close()
  110. def importTurnDataOfRole():
  111. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  112. cursor = conn.cursor()
  113. for file in glob.glob("done/*.xls"):
  114. if re.search("wrong",file) is not None:
  115. continue
  116. df = pd.read_excel(file)
  117. for entity_id,old_label,new_label in zip(df["entity_id"],df["list_label"],df["turn"]):
  118. ''''''
  119. if math.isnan(new_label):
  120. new_label = old_label
  121. print(entity_id)
  122. sql = " insert into turn_label(entity_id,old_label,new_label) values('"+entity_id+"','"+str(int(old_label))+"','"+str(int(new_label))+"')"
  123. cursor.execute(sql)
  124. conn.commit()
  125. conn.close()
  126. def selectWithRule(source,filter,target):
  127. assert source!=target
  128. dict_source = pd.read_excel(source)
  129. set_filter = set()
  130. for filt in filter:
  131. set_filter = set_filter | set(pd.read_excel(filt)["list_entityid"])
  132. list_entity_id = []
  133. list_before = []
  134. list_text = []
  135. list_after = []
  136. list_label = []
  137. selectdata = []
  138. for id,before,text,after,label in zip(dict_source["list_entityid"],dict_source["list_before"],dict_source["list_center"],dict_source["list_after"],dict_source["list_label"]):
  139. if id in set_filter:
  140. continue
  141. if re.search("",str(before)) is not None:
  142. selectdata.append([id,before,text,after,label])
  143. selectdata.sort(key=lambda x:x[4])
  144. for item in selectdata:
  145. list_entity_id.append(item[0])
  146. list_before.append(item[1])
  147. list_text.append(item[2])
  148. list_after.append(item[3])
  149. list_label.append(item[4])
  150. data = {"list_entityid":list_entity_id,"list_before":list_before,"list_text":list_text,"list_after":list_after,"list_label":list_label}
  151. columns = ["list_entityid","list_before","list_text","list_after","list_label"]
  152. df = pd.DataFrame(data)
  153. df.to_excel(target,index=False,columns=columns)
  154. def dumpData():
  155. files = []
  156. for file in glob.glob("C:\\Users\\User\\Desktop\\20190416要素\\*.html"):
  157. filename = file.split("\\")[-1]
  158. files.append(filename)
  159. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  160. cursor = conn.cursor()
  161. sql = " select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,C.new_label from sentences_selffool A,entity_mention_selffool B,turn_label_selffool C where B.entity_type in ('org','company') and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id "
  162. cursor.execute(sql)
  163. rows = cursor.fetchall()
  164. data = []
  165. for row in rows:
  166. if "_".join(row[0].split("_")[:-3]) not in files:
  167. data.append(row)
  168. save(data,"id_token_text_begin_end_label-selffool.pk1")
  169. conn.close()
def generateTrainData():
    # Run the full prediction pipeline over every processed article and
    # persist articles/sentences/entities (with predicted labels) to the DB.
    codeNamePredict = predictor.CodeNamePredict()
    premPredict = predictor.PREMPredict()
    epcPredict = predictor.EPCPredict()
    roleRulePredict = predictor.RoleRulePredictor()
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select id,content from articles_processed"
    cursor.execute(sql)
    data = cursor.fetchall()
    # _count doubles as a resume offset: bump it to skip already-done rows.
    _count = 0
    for row in data[_count:]:
        try:
            _count += 1
            print(_count, len(data))
            doc_id = row[0]
            text = row[1]
            # Pipeline: preprocess -> code/name -> PREM -> role rules -> EPC,
            # then persist everything for this document and commit.
            list_articles, list_sentences, list_entitys = Preprocessing.get_preprocessed([[doc_id, text]], useselffool=True)
            codeName = codeNamePredict.predict(list_articles)
            premPredict.predict(list_sentences, list_entitys)
            roleRulePredict.predict(list_sentences, list_entitys, codeName[0][1]["name"])
            epcPredict.predict(list_sentences, list_entitys)
            persistArticle(conn, list_articles, codeName)
            for sentences in list_sentences:
                persistSentence(conn, sentences)
            for entitys in list_entitys:
                persistEntity(conn, entitys)
            conn.commit()
        except Exception as e:
            # Best-effort batch job: log the failing document and keep going.
            print(doc_id, str(e))
    conn.close()
    # NOTE(review): leftover dead code — opens and immediately closes a second
    # connection with an unused cursor; appears to have no effect. Confirm and
    # remove.
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    conn.close()
def getDifferenctTrainData():
    # Build the set of context windows present in the previously dumped
    # training data, then export DB entity mentions that do NOT yet exist in
    # entity_mention (deduplicated by context window) to role_notexists.xls.
    files = ["id_token_text_begin_end_label.pk1"]
    train_set_before = set()
    for file in files:
        data = load(file)
        for row in data:
            # This spanWindow comes from the wildcard Utils import (it accepts
            # center_include/word_flag/text, unlike the local helper inside
            # getWrongData) — TODO confirm against Utils.
            _span = spanWindow(tokens=row[1], begin_index=row[3], end_index=row[4], size=10, center_include=True, word_flag=True, text=row[2])
            _label = row[5]
            item = (str(_span[0]), str(_span[1]), str(_span[2]), str(_label))
            train_set_before.add(item)
    # The block below is retired code that split rows into "same"/"notsame"
    # sets against train_set_before; kept for reference.
    '''
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,B.label from sentences_selffool A,entity_mention_selffool B,turn_label C where B.entity_type in ('org','company') and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and B.entity_id=C.entity_id "
    cursor.execute(sql)
    rows = cursor.fetchall()
    list_same_entityid = []
    list_same_before = []
    list_same_center = []
    list_same_after = []
    list_same_label = []
    list_notsame_entityid = []
    list_notsame_before = []
    list_notsame_center = []
    list_notsame_after = []
    list_notsame_label = []
    train_set_now = set()
    _index = 0
    for row in rows:
        _span = spanWindow(tokens=row[1],begin_index=row[3],end_index=row[4],size=10,center_include=True,word_flag=True,text=row[2])
        _label = row[5]
        item = (str(_span[0]),str(_span[1]),str(_span[2]),str(_label))
        if item in train_set_before:
            list_same_entityid.append(row[0])
            list_same_before.append(_span[0])
            list_same_center.append(_span[1])
            list_same_after.append(_span[2])
            list_same_label.append(str(_label))
            if len(list_same_entityid)>65000:
                data_same = {"list_entityid":list_same_entityid,
                             "list_before":list_same_before,
                             "list_center":list_same_center,
                             "list_after":list_same_after,
                             "list_label":list_same_label}
                df = pd.DataFrame(data_same,columns=["list_entityid","list_before","list_center","list_after","list_label"])
                df.to_excel("role_same"+str(_index)+".xls")
                _index += 1
                list_same_entityid = []
                list_same_before = []
                list_same_center = []
                list_same_after = []
                list_same_label = []
        else:
            if item not in train_set_now:
                list_notsame_entityid.append(row[0])
                list_notsame_before.append(_span[0])
                list_notsame_center.append(_span[1])
                list_notsame_after.append(_span[2])
                list_notsame_label.append(str(_label))
                train_set_now.add(item)
    data_same = {"list_entityid":list_same_entityid,
                 "list_before":list_same_before,
                 "list_center":list_same_center,
                 "list_after":list_same_after,
                 "list_label":list_same_label}
    df = pd.DataFrame(data_same,columns=["list_entityid","list_before","list_center","list_after","list_label"])
    df.to_excel("role_same"+str(_index)+".xls")
    data_notsame = {"list_entityid":list_notsame_entityid,
                    "list_before":list_notsame_before,
                    "list_center":list_notsame_center,
                    "list_after":list_notsame_after,
                    "list_label":list_notsame_label}
    df = pd.DataFrame(data_notsame,columns=["list_entityid","list_before","list_center","list_after","list_label"])
    df.to_excel("role_notsame.xls")
    '''
    # Export entity mentions absent from entity_mention, one row per unique
    # (before, center, after, label) context window, sorted by label.
    _context_set = set()
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select B.entity_id,A.tokens,B.entity_text,B.begin_index,B.end_index,B.label from sentences_selffool A,entity_mention_selffool B where B.entity_type in ('org','company') and A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and not exists(select 1 from entity_mention where entity_mention.entity_id=B.entity_id) "
    cursor.execute(sql)
    rows = cursor.fetchall()
    list_notexists_entityid = []
    list_notexists_before = []
    list_notexists_center = []
    list_notexists_after = []
    list_notexists_label = []
    rows.sort(key=lambda x: x[5])
    for row in rows:
        _span = spanWindow(tokens=row[1], begin_index=row[3], end_index=row[4], size=10, center_include=True, word_flag=True, text=row[2])
        _label = row[5]
        item = (str(_span[0]), str(_span[1]), str(_span[2]), str(_label))
        if item not in _context_set:
            list_notexists_entityid.append(row[0])
            list_notexists_before.append(_span[0])
            list_notexists_center.append(_span[1])
            list_notexists_after.append(_span[2])
            list_notexists_label.append(str(_label))
            _context_set.add(item)
    data_notexists = {"list_entityid": list_notexists_entityid,
                      "list_before": list_notexists_before,
                      "list_center": list_notexists_center,
                      "list_after": list_notexists_after,
                      "list_label": list_notexists_label}
    df = pd.DataFrame(data_notexists, columns=["list_entityid", "list_before", "list_center", "list_after", "list_label"])
    df.to_excel("role_notexists.xls")
    # NOTE(review): the connection opened above is never closed here — confirm
    # whether that is intentional.
  309. def updateTurnLabel():
  310. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  311. cursor = conn.cursor()
  312. df = pd.read_excel("批量.xls")
  313. for entity_id,label in zip(df["list_entityid"],df["list_label"]):
  314. sql = " update turn_label_selffool set new_label="+str(int(label))+" where entity_id='"+str(entity_id)+"' "
  315. cursor.execute(sql)
  316. conn.commit()
  317. conn.close()
  318. def importTurnLabel():
  319. conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
  320. cursor = conn.cursor()
  321. df = pd.read_excel("批量notexists.xls")
  322. for entity_id,label in zip(df["list_entityid"],df["list_label"]):
  323. sql = " insert into turn_label_selffool(entity_id,new_label) values('"+entity_id+"',"+str(int(label))+")"
  324. cursor.execute(sql)
  325. conn.commit()
  326. conn.close()
  327. #print(train_set_before)
if __name__ == "__main__":
    # Ad-hoc maintenance entry point: uncomment whichever step is needed.
    #getWrongData()
    #importWrongDataOfRole()
    #selectWithRule("role_notexists.xls",["批量notexists.xls"],"rule.xls")
    #importTurnDataOfRole()
    #dumpData()
    #generateTrainData()
    #getDifferenctTrainData()
    #updateTurnLabel()
    #importTurnLabel()
    # Sanity check: report the sizes of the two pickled datasets
    # (load presumably comes from the wildcard Utils import — confirm).
    a = load("id_token_text_begin_end_label.pk1")
    print(len(a))
    b = load("id_token_text_begin_end_label-selffool.pk1")
    print(len(b))