label.py 75 KB

'''
Created on 2019-06-04
@author: User
'''
import fool
import BiddingKG.dl.interface.Preprocessing as Preprocessing
from bs4 import BeautifulSoup
import re
import os
import codecs
from BiddingKG.dl.common.Utils import save,load, findAllIndex
import glob
import threading
import numpy as np
import time
from zipfile import ZipFile
import json
import psycopg2
import pandas as pd
import math
from BiddingKG.dl.foolnltk.bi_lstm_crf import BiLSTM
import copy
from BiddingKG.dl.interface.Entitys import *
from BiddingKG.dl.foolnltk.Entity2DB import *
import tensorflow as tf
import requests
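
# Overview (summary of what follows): this script builds and maintains character-level NER
# training data for the selffool BiLSTM-CRF model. It runs fool/selffool over bidding
# articles, stores the entity mentions in PostgreSQL for manual correction, and turns the
# corrected mentions back into B/M/E/S-tagged text files and clustering spreadsheets.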
def getNers(sentences,MAXAREA = 100000,userselffool=False):
    '''
    @param sentences: the list of sentences to process
    @return: the segmentation and NER results, computed batch by batch so that each batch stays under MAXAREA
    '''
    def getData(ners,process_data):
        process_sentences = [item[1] for item in process_data]
        if userselffool:
            ner_ = Preprocessing.selffool.ner(process_sentences)
        else:
            ner_ = fool.ner(process_sentences)
        for i in range(len(ner_)):
            the_index = process_data[i][0]
            ners[the_index] = ner_[i]
    sents = []
    for i in range(len(sentences)):
        sents.append([i,sentences[i]])
    sents.sort(key=lambda x:len(x[1]),reverse=True)
    index_ = 0
    ners = [[] for i in range(len(sentences))]
    while(True):
        width = len(sents[index_][1])
        height = MAXAREA//width+1
        if height>len(sents)-index_:
            height = len(sents)-index_
        process_data = sents[index_:index_+height]
        getData(ners, process_data)
        index_ += height
        if index_>=len(sents):
            break
    return ners
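
# A minimal usage sketch (the sentences below are only illustrations): each element of the
# returned list mirrors fool's output for the sentence at the same position in the input,
# i.e. a list of (begin_index, end_index, type, text) tuples.
#   ners = getNers(["深圳市政府采购中心发布招标公告。", "中标人:某某有限公司。"])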
def preprocess(list_articles):
    '''
    @summary: preprocess the text and store foolnltk's recognition results in the database, for review and correction
    '''
    import psycopg2
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    list_filename_text = []
    cursor = conn.cursor()
    for article in list_articles:
        doc_id = article[0]
        text = Preprocessing.segment(Preprocessing.tableToText(BeautifulSoup(article[1],"lxml")))
        list_filename_text.append([doc_id,text,article[2]])
        list_sent = []
        for x in re.split("[。]", text):
            if len(x)>0:
                list_sent.append(x+"。")
        for n in getNers(list_sent):
            for _entity in n:
                print(_entity)
                sql = " insert into fool_ner_train(filename,begin_index,end_index,type,text) values('"+str(doc_id)+"',"+str(_entity[0])+","+str(_entity[1])+",'"+str(_entity[2])+"','"+str(_entity[3])+"')"
                cursor.execute(sql)
        conn.commit()
    conn.close()
    return list_filename_text
def hasNotBeenLabeled(items,code_begin,code):
    for i in range(code_begin,code_begin+len(code)):
        if items[i][1]!="O":
            return False
    return True
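
# Tags used throughout this file follow a character-level B/M/E/S scheme: "S_<type>" for a
# single-character entity, otherwise "B_<type>" ... "M_<type>" ... "E_<type>", with "O" for
# characters outside any entity. hasNotBeenLabeled() guards against overwriting a span that
# was already tagged by a longer entity (entities are matched longest-first).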
def findAllIndex(substr,wholestr):
    copystr = wholestr
    result = []
    indexappend = 0
    while(True):
        index = copystr.find(substr)
        if index<0:
            break
        else:
            result.append(indexappend+index)
            indexappend += index+len(substr)
            copystr = copystr[index+len(substr):]
    return result
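
# Example: findAllIndex("ab", "abcab") returns [0, 3]; occurrences are non-overlapping
# because the search resumes after the end of each match.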
def labelEntity():
    '''
    @summary: label the data: query the entity information from the database and generate tag data for the text
    '''
    import psycopg2
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    list_filename_text = load("list_filename_text_wrongEntity.pk")
    list_sent_label = []
    list_text_label = []
    sql = " select distinct filename from fool_ner t where not exists(select 1 from fool_ner a where t.filename=a.filename and type_0 in('org','company') and new_type is NULL) "
    cursor.execute(sql)
    set_filename = set()
    for row in cursor.fetchall():
        set_filename.add(row[0])
    for filename_text in list_filename_text:
        filename = filename_text[0]
        text = filename_text[1]
        if filename not in set_filename:
            continue
        sql = " select text,type_0,new_text,new_type from fool_ner where filename='"+filename+"' group by text,type_0,new_text,new_type"
        print(sql)
        cursor.execute(sql)
        rows = cursor.fetchall()
        rows.sort(key=lambda x:len(x[0]))
        list_entity = []
        for row in rows:
            entity = row[0]
            type = row[1]
            new_entity = row[2]
            new_type = row[3]
            _entitys = []
            if new_type is None or new_type=="" or new_type=="nan":
                _type = type
            else:
                _type = new_type
            if new_entity=="1":
                continue
            elif new_entity is None or new_entity =="" or new_entity=="nan":
                list_entity.append([entity,_type])
                _entitys.append([entity,_type])
            else:
                for _entity in new_entity.split("##"):
                    list_entity.append([_entity,_type])
                    _entitys.append([_entity,_type])
            if len(_entitys)>=2:
                _find_flag = False
                data_item = []
                for i in range(len(str(entity))):
                    _item = []
                    _item.append(entity[i])
                    _item.append("O")
                    data_item.append(_item)
                for _entity_type in _entitys:
                    _entity = _entity_type[0]
                    _type = _entity_type[1]
                    if _type not in ["person","company","org","job","time","location"]:
                        continue
                    for _index in findAllIndex(_entity, entity):
                        _find_flag = True
                        if len(_entity)==1:
                            if hasNotBeenLabeled(data_item, _index, _entity):
                                data_item[_index][1] = "S_"+_type
                        else:
                            if hasNotBeenLabeled(data_item, _index, _entity):
                                for j in range(_index,_index+len(_entity)):
                                    if j==_index:
                                        data_item[j][1] = "B_"+_type
                                    elif j==_index+len(_entity)-1:
                                        data_item[j][1] = "E_"+_type
                                    else:
                                        data_item[j][1] = "M_"+_type
                if _find_flag:
                    list_text_label.append(data_item)
                list_insert = [" ","根据","就","受",",",",","。",":",":","#","&","$","、","/","-","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","RR","S","TA","U","V","Wa","X","YG","Z","a","b","c","d","e","f","g"]
                for insert_item in list_insert:
                    if np.random.random()>0.7:
                        copy_data_item = copy.copy(data_item)
                        list_index = []
                        for i in range(len(copy_data_item)):
                            _split = copy_data_item[i][1].split("_")
                            if len(_split)==2:
                                if _split[0]=="B":
                                    list_index.append(i)
                                if _split[0]=="E":
                                    list_index.append(i+1)
                        list_index.sort(key=lambda x:x,reverse=True)
                        for _index in list_index:
                            if np.random.random()>0.5:
                                for j in range(len(insert_item)):
                                    copy_data_item.insert(_index+j,[insert_item[j],"O"])
                                if np.random.random()>0.5:
                                    break
                        list_text_label.append(copy_data_item)
        ''''''
        list_entity.sort(key=lambda x:len(x[0]),reverse=True)
        for _sent in text.split("。"):
            _sent += "。"
            _find_flag = False
            data_item = []
            for i in range(len(str(_sent))):
                _item = []
                _item.append(_sent[i])
                _item.append("O")
                data_item.append(_item)
            for _entity_type in list_entity:
                _entity = _entity_type[0]
                _type = _entity_type[1]
                if _type not in ["person","company","org","job","time","location"]:
                    continue
                for _index in findAllIndex(_entity, _sent):
                    _find_flag = True
                    if len(_entity)==1:
                        if hasNotBeenLabeled(data_item, _index, _entity):
                            data_item[_index][1] = "S_"+_type
                    else:
                        if hasNotBeenLabeled(data_item, _index, _entity):
                            for j in range(_index,_index+len(_entity)):
                                if j==_index:
                                    data_item[j][1] = "B_"+_type
                                elif j==_index+len(_entity)-1:
                                    data_item[j][1] = "E_"+_type
                                else:
                                    data_item[j][1] = "M_"+_type
            #decide whether to add the sentence to the training data based on whether it contains entities
            if _find_flag:
                list_sent_label.append(data_item)
            else:
                if np.random.random()>0.9:
                    list_sent_label.append(data_item)
    ''''''
    with codecs.open("ner_label.txt","w",encoding="utf8") as f:
        for _sent_label in list_sent_label:
            for _word,_label in _sent_label:
                f.write(_word+" "+_label+"\n")
            f.write("\n")
        f.flush()
    with codecs.open("ner_label_split.txt","w",encoding="utf8") as f:
        for _sent_label in list_text_label:
            for _word,_label in _sent_label:
                f.write(_word+" "+_label+"\n")
            f.write("\n")
        f.flush()
    return list_sent_label
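
# ner_label.txt / ner_label_split.txt are written in the usual sequence-labeling layout:
# one "<character> <tag>" pair per line with a blank line between sentences, which is the
# format that getContext() and readlabeldata() below expect.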
class MyThread(threading.Thread):
    def __init__(self,func,args=()):
        super(MyThread,self).__init__()
        self.func = func
        self.args = args
    def run(self):
        self.result = self.func(*self.args)
    def get_result(self):
        try:
            return self.result  # if the child thread has not been joined, self.result may not exist yet
        except Exception:
            return None
def deal():
    list_articles = []
    path = "C:\\Users\\User\\Desktop\\fool语料\\*.html"
    set_doc_id = set()
    for file in glob.glob(path):
        filename = file.split("\\")[-1]
        doc_id = filename.split("_")[-1][:-5]
        text = codecs.open(file,"r",encoding="utf8").read()
        wrong_entity = "".join(filename.split("_")[:-1])
        if doc_id in set_doc_id:
            for item in list_articles:
                if doc_id==item[0]:
                    item[2].append(wrong_entity)
        else:
            set_doc_id.add(doc_id)
            list_articles.append([doc_id,text,[wrong_entity]])
    save(list_articles,"list_filename_html_wrongEntity.pk")
def dataSplit(data,parts=2):
    _index = 0
    part_len = len(data)//parts
    while(True):
        if _index+part_len<len(data):
            yield data[_index:_index+part_len]
            _index += part_len
        else:
            yield data[_index:]
            break
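
# Example: list(dataSplit(list(range(10)), parts=3)) yields [0..2], [3..5], [6..8] and a
# final shorter chunk [9], since the remainder is emitted as the last slice.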
def makeFoolTrainData():
    '''
    @summary: generate fool training data
    '''
    import psycopg2
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    list_articles = []
    #list_path = ["C:\\Users\\User\\Desktop\\20190306要素\\*.html","C:\\Users\\User\\Desktop\\20190320要素\\*.html"]
    list_path = ["C:\\Users\\User\\Desktop\\data_20190703\\*.html"]
    set_doc_id = set()
    for path in list_path:
        for file in glob.glob(path):
            filename = file.split("\\")[-1]
            text = codecs.open(file,"r",encoding="utf8").read()
            if filename in set_doc_id:
                continue
            else:
                set_doc_id.add(filename)
                list_articles.append([filename,text])
    list_filename_text = []
    cursor = conn.cursor()
    _count = 0
    for article in list_articles:
        _count += 1
        print(str(_count)+"/"+str(len(list_articles)))
        doc_id = article[0]
        text = Preprocessing.segment(Preprocessing.tableToText(BeautifulSoup(article[1],"lxml")))
        list_filename_text.append([doc_id,text])
        list_sent = []
        for x in re.split("[。]", text):
            if len(x)>0:
                list_sent.append(x+"。")
        for n in getNers(list_sent,userselffool=True):
            for _entity in n:
                sql = " insert into fool_ner_train_1(filename,begin_index,end_index,type_0,text) values('"+str(doc_id)+"',"+str(_entity[0])+","+str(_entity[1])+",'"+str(_entity[2])+"','"+str(_entity[3])+"')"
                cursor.execute(sql)
        conn.commit()
    conn.close()
    save(list_filename_text,"list_filename_text_train_1.pk")
    return list_filename_text
def makeLabel(sent,list_entity_type):
    _find_flag = False
    data_item = []
    list_entity_type.sort(key=lambda x:len(x[0]),reverse=True)
    for i in range(len(str(sent))):
        _item = []
        _item.append(sent[i])
        _item.append("O")
        data_item.append(_item)
    for _entity_type in list_entity_type:
        _entity = _entity_type[0]
        _type = _entity_type[1]
        if _type not in ["person","company","org","job","time","location"]:
            continue
        for _index in findAllIndex(_entity, sent):
            _find_flag = True
            if len(_entity)==1:
                if hasNotBeenLabeled(data_item, _index, _entity):
                    data_item[_index][1] = "S_"+_type
            else:
                if hasNotBeenLabeled(data_item, _index, _entity):
                    for j in range(_index,_index+len(_entity)):
                        if j==_index:
                            data_item[j][1] = "B_"+_type
                        elif j==_index+len(_entity)-1:
                            data_item[j][1] = "E_"+_type
                        else:
                            data_item[j][1] = "M_"+_type
    return data_item,_find_flag
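
# A small sketch of makeLabel with hypothetical inputs; the second return value tells the
# caller whether any entity was actually found in the sentence:
#   data_item, found = makeLabel("招标人:北京某某公司。", [["北京某某公司", "company"]])
#   # data_item -> [["招","O"], ..., ["北","B_company"], ..., ["司","E_company"], ["。","O"]], found -> True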
def makeTrainTxt():
    '''
    @summary: generate the training data text files
    '''
    import psycopg2
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    list_filename_text = load("list_filename_text_train.pk")
    list_sent_label = []
    list_text_label = []
    for filename_text in list_filename_text:
        filename = filename_text[0]
        text = filename_text[1]
        sql = " select text,type_0,new_text,new_type from fool_ner_train where filename='"+filename+"' group by text,type_0,new_text,new_type"
        print(sql)
        cursor.execute(sql)
        rows = cursor.fetchall()
        rows.sort(key=lambda x:len(x[0]))
        list_entity = []
        for row in rows:
            entity = row[0]
            type = row[1]
            new_entity = row[2]
            new_type = row[3]
            _entitys = []
            if new_type is None or new_type=="" or new_type=="nan":
                _type = type
            else:
                _type = new_type
            if new_entity=="1":
                continue
            elif new_entity is None or new_entity =="" or new_entity=="nan":
                list_entity.append([entity,_type])
                _entitys.append([entity,_type])
            else:
                for _entity in new_entity.split("##"):
                    list_entity.append([_entity,_type])
                    _entitys.append([_entity,_type])
            if len(_entitys)>=2:
                _find_flag = False
                data_item = []
                for i in range(len(str(entity))):
                    _item = []
                    _item.append(entity[i])
                    _item.append("O")
                    data_item.append(_item)
                for _entity_type in _entitys:
                    _entity = _entity_type[0]
                    _type = _entity_type[1]
                    if _type not in ["person","company","org","job","time","location"]:
                        continue
                    for _index in findAllIndex(_entity, entity):
                        _find_flag = True
                        if len(_entity)==1:
                            if hasNotBeenLabeled(data_item, _index, _entity):
                                data_item[_index][1] = "S_"+_type
                        else:
                            if hasNotBeenLabeled(data_item, _index, _entity):
                                for j in range(_index,_index+len(_entity)):
                                    if j==_index:
                                        data_item[j][1] = "B_"+_type
                                    elif j==_index+len(_entity)-1:
                                        data_item[j][1] = "E_"+_type
                                    else:
                                        data_item[j][1] = "M_"+_type
                if _find_flag:
                    list_text_label.append(data_item)
                list_insert = ["根据","就",",",",","。",":",":"]
                for insert_item in list_insert:
                    if np.random.random()>0.5:
                        copy_data_item = copy.copy(data_item)
                        list_index = []
                        for i in range(len(copy_data_item)):
                            _split = copy_data_item[i][1].split("_")
                            if len(_split)==2:
                                if _split[0]=="B":
                                    list_index.append(i)
                                if _split[0]=="E":
                                    list_index.append(i+1)
                        list_index.sort(key=lambda x:x,reverse=True)
                        for _index in list_index:
                            if np.random.random()>0.5:
                                for j in range(len(insert_item)):
                                    copy_data_item.insert(_index+j,[insert_item[j],"O"])
                        list_text_label.append(copy_data_item)
                list_insert = [" ","根据","就","受",",",",","。",":",":","#","&","$","、","/","-","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","RR","S","TA","U","V","Wa","X","YG","Z","a","b","c","d","e","f","g"]
                for insert_item in list_insert:
                    if np.random.random()>0.7:
                        copy_data_item = copy.copy(data_item)
                        list_index = []
                        for i in range(len(copy_data_item)):
                            _split = copy_data_item[i][1].split("_")
                            if len(_split)==2:
                                if _split[0]=="B":
                                    list_index.append(i)
                                if _split[0]=="E":
                                    list_index.append(i+1)
                        list_index.sort(key=lambda x:x,reverse=True)
                        for _index in list_index:
                            if np.random.random()>0.5:
                                for j in range(len(insert_item)):
                                    copy_data_item.insert(_index+j,[insert_item[j],"O"])
                                if np.random.random()>0.5:
                                    break
                        list_text_label.append(copy_data_item)
        ''''''
        list_entity.sort(key=lambda x:len(x[0]),reverse=True)
        for _sent in text.split("。"):
            _sent += "。"
            _find_flag = False
            data_item = []
            for i in range(len(str(_sent))):
                _item = []
                _item.append(_sent[i])
                _item.append("O")
                data_item.append(_item)
            for _entity_type in list_entity:
                _entity = _entity_type[0]
                _type = _entity_type[1]
                if _type not in ["person","company","org","job","time","location"]:
                    continue
                for _index in findAllIndex(_entity, _sent):
                    _find_flag = True
                    if len(_entity)==1:
                        if hasNotBeenLabeled(data_item, _index, _entity):
                            data_item[_index][1] = "S_"+_type
                    else:
                        if hasNotBeenLabeled(data_item, _index, _entity):
                            for j in range(_index,_index+len(_entity)):
                                if j==_index:
                                    data_item[j][1] = "B_"+_type
                                elif j==_index+len(_entity)-1:
                                    data_item[j][1] = "E_"+_type
                                else:
                                    data_item[j][1] = "M_"+_type
            #decide whether to add the sentence to the training data based on whether it contains entities
            if _find_flag:
                list_sent_label.append(data_item)
            else:
                if np.random.random()>0.9:
                    list_sent_label.append(data_item)
    ''' '''
    with codecs.open("ner_train.txt","w",encoding="utf8") as f:
        for _sent_label in list_sent_label:
            for _word,_label in _sent_label:
                f.write(_word+" "+_label+"\n")
            f.write("\n")
        f.flush()
    with codecs.open("ner_train_split.txt","w",encoding="utf8") as f:
        for _sent_label in list_text_label:
            for _word,_label in _sent_label:
                f.write(_word+" "+_label+"\n")
            f.write("\n")
        f.flush()
def _load_map_file(path, char_map_name, id_map_name):
    with ZipFile(path) as myzip:
        with myzip.open('all_map.json') as myfile:
            content = myfile.readline()
            content = content.decode()
            data = json.loads(content)
            return data.get(char_map_name), data.get(id_map_name)
def getContext(file):
    char_to_id, id_to_seg = _load_map_file("data/map.zip", "char_map", "ner_map")
    id_to_tag = {int(k):v for k,v in id_to_seg.items()}
    tag_to_id = {v:int(k) for k,v in id_to_seg.items()}
    list_sent_label = []
    with codecs.open(file,"r",encoding="utf8") as f:
        sentence = []
        while(True):
            line = f.readline()
            if not line:
                break
            if len(line)==1:
                if len(sentence)>0:
                    list_sent_label.append(sentence)
                    sentence = []
            else:
                _word_id = char_to_id.get(line[0]) if line[0] in char_to_id.keys() else char_to_id.get("<OOV>")
                _tag_id = tag_to_id.get(line.split()[-1].strip())
                sentence.append([_word_id,_tag_id])
    return list_sent_label
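
# getContext() consumes the "<character> <tag>" / blank-line format produced above and maps
# characters and tags to ids via data/map.zip, so each element of list_sent_label is a
# sentence given as [[char_id, tag_id], ...].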
def readlabeldata(file,list_context,MAX_LEN=300,keep_prob=1):
    '''
    @summary: read the labeled data from the file
    '''
    def addContext(_sentence,entity_sent,entity_label,id_B_company,id_E_company):
        _sent = []
        _label = []
        _flag = 0
        _find_flag = False
        for item in _sentence:
            if _flag==0:
                if item[1]==id_B_company:
                    for word_id,tag_id in zip(entity_sent,entity_label):
                        _sent.append(word_id)
                        _label.append(tag_id)
                    _flag = 1
                    _find_flag = True
                else:
                    _sent.append(item[0])
                    _label.append(item[1])
            elif _flag==1:
                if item[1]==id_E_company:
                    _flag = 2
                else:
                    continue
            else:
                _sent.append(item[0])
                _label.append(item[1])
        return _sent,_label,_find_flag
    def spreadContext(_sent,_label,id_to_char,id_to_tag):
        list_sent_label = []
        for _word,_l in zip(_sent,_label):
            list_sent_label.append([id_to_char.get(_word),id_to_tag.get(_l)])
        print(list_sent_label)
    list_sent_label_lengths = []
    char_to_id, id_to_seg = _load_map_file("data/map.zip", "char_map", "ner_map")
    id_to_char = {int(v):k for k,v in char_to_id.items()}
    id_to_tag = {int(k):v for k,v in id_to_seg.items()}
    tag_to_id = {v:int(k) for k,v in id_to_seg.items()}
    id_B_company = tag_to_id.get("B_company")
    id_E_company = tag_to_id.get("E_company")
    with codecs.open(file,"r",encoding="utf8") as f:
        _sent = []
        _label = []
        while(True):
            line = f.readline()
            if not line:
                break
            if len(line)==1:
                if np.random.rand()<keep_prob:
                    if len(_label)>0 and _label[0]==id_B_company and _label[-1]==id_E_company:
                        if np.random.rand()<0.8:
                            _int_random = np.random.randint(0,len(list_context))
                            _sentence = list_context[_int_random]
                            _sent_context,_label_context,_find_flag = addContext(_sentence, _sent, _label, id_B_company, id_E_company)
                            if _find_flag:
                                if len(_sent_context)<MAX_LEN:
                                    list_sent_label_lengths.append([_sent_context,_label_context,len(_sent_context)])
                            else:
                                if len(_sent)<MAX_LEN and len(_sent)>0:
                                    list_sent_label_lengths.append([_sent,_label,len(_sent)])
                            '''
                            print("====")
                            spreadContext(_sent, _label, id_to_char, id_to_tag)
                            spreadContext(_sent_context, _label_context, id_to_char, id_to_tag)
                            print("====")
                            '''
                        else:
                            if len(_sent)<MAX_LEN and len(_sent)>0:
                                list_sent_label_lengths.append([_sent,_label,len(_sent)])
                _sent = []
                _label = []
            else:
                _sent.append(char_to_id.get(line[0]) if line[0] in char_to_id.keys() else char_to_id.get("<OOV>"))
                tag = line.split()[-1].strip()
                _label.append(tag_to_id.get(tag))
    return list_sent_label_lengths
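
# readlabeldata() returns a list of [char_ids, tag_ids, length] triples. Sentences that are a
# bare company entity (B_company ... E_company) are, with 80% probability, spliced into a
# random context sentence drawn from list_context before being added.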
def gt3():
    '''
    @summary: get the records whose wrong-entity label is longer than 3 characters
    '''
    list_articles = []
    list_filename_html_wrongEntity = load("list_filename_html_wrongEntity.pk")
    for row in list_filename_html_wrongEntity:
        if len(row[2])>1 or len(row[2][0])>3:
            list_articles.append(row)
    print(len(list_articles))
    save(list_articles,"list_filename_html_wrongEntity_gt3.pk")
def selectByRule():
    '''
    @summary: query the records that match the rules from the database, to make correction easier
    '''
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    tables = ["fool_ner","fool_ner_train"]
    #tables = ["fool_ner"]
    for table in tables:
        sql = " select filename,type_0,text,new_type,new_text from "+table+" where ctid in (select max(ctid) from "+table+" where type_0 in ('org','company','location') group by filename,text) order by text "
        cursor.execute(sql)
        rows = cursor.fetchall()
        list_filename = []
        list_type_0 = []
        list_text = []
        list_new_type = []
        list_new_text = []
        pattern = "室"
        list_keyword = ["厂","所","出","院","部","行","局","社","采招办","酒店","办事处","分理处","管理处","集团","组织","支队","部队","支行","银行","支局","分行","分公司","公司","中心","医院","卫生院","小学","中学","大学","学校","政府","委员会","委会","财政局"]
        list_second_keyword = ["处","厅","园","委","队","室","站","会","办","馆","共和国","科技"]
        for row in rows:
            filename = row[0]
            type_0 = row[1]
            entity = row[2]
            new_type = row[3]
            new_entity = row[4]
            list_entity = []
            if new_type is None or new_type=="" or new_type=="nan":
                _type = type_0
            else:
                _type = new_type
            if new_entity=="1":
                continue
            elif new_entity is None or new_entity =="" or new_entity=="nan":
                list_entity.append([entity,_type,new_entity])
            else:
                for _entity in new_entity.split("##"):
                    list_entity.append([_entity,_type,entity])
            _flag = False
            _index = 0
            for _entity in list_entity:
                '''
                if re.search('监狱.{,4}$',entity) is not None:
                    _flag = True
                '''
                if (len(entity)>2 and entity[-1]==entity[-2]) or (len(entity)>4 and entity[-4:-2]==entity[-2:]):
                    _flag = True
                '''
                pattern = "|".join(list_keyword)
                for _iter in re.finditer(pattern,text):
                    if _iter.span()[1]>_index:
                        _index = _iter.span()[1]
                        new_text = text[:_index]
                if _index == 0:
                    for _iter in re.finditer("|".join(list_second_keyword),text):
                        if _iter.span()[1]>_index:
                            _index = _iter.span()[1]
                            new_text = text[:_index]
                '''
                '''
                for keyword in list_keyword:
                    if _flag:
                        break
                    allindex = findAllIndex(keyword, text)
                    if len(allindex)>0:
                        _flag = True
                        _index = allindex[-1]+len(keyword)
                        new_text = text[:_index]
                '''
            if _flag:
                list_filename.append(filename)
                list_type_0.append(type_0)
                list_text.append(entity)
                list_new_type.append(new_type)
                list_new_text.append(new_entity)
        data = {"list_filename":list_filename,"list_type_0":list_type_0,"list_text":list_text,"list_new_type":list_new_type,"list_new_text":list_new_text}
        df = pd.DataFrame(data,columns=["list_filename","list_type_0","list_text","list_new_type","list_new_text"])
        df.to_excel(table+".xls")
def makeDict_filename_content():
    dict_filename_content = {}
    path = "C:\\Users\\User\\Desktop\\fool语料\\*.html"
    set_doc_id = set()
    for file in glob.glob(path):
        filename = file.split("\\")[-1]
        doc_id = filename.split("_")[-1][:-5]
        text = codecs.open(file,"r",encoding="utf8").read()
        dict_filename_content[doc_id] = text
    list_path = ["C:\\Users\\User\\Desktop\\20190416要素\\*.html","C:\\Users\\User\\Desktop\\20190306要素\\*.html","C:\\Users\\User\\Desktop\\20190320要素\\*.html","C:\\Users\\User\\Desktop\\data_20190703\\*.html","C:\\Users\\User\\Desktop\\20190715\\*.html"]
    for path in list_path:
        for file in glob.glob(path):
            filename = file.split("\\")[-1]
            text = codecs.open(file,"r",encoding="utf8").read()
            dict_filename_content[filename] = text
    save(dict_filename_content,"dict_filename_content.pk")
def importLabelData():
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    for file in glob.glob("label/*.xls"):
        if len(file.split("_"))>1:
            table = "fool_ner"
        else:
            table = "fool_ner_train"
        print(file,table)
        df = pd.read_excel(file)
        for filename,type_0,text,new_type,new_text in zip(df["list_filename"],df["list_type_0"],df["list_text"],df["list_new_type"],df["list_new_text"]):
            sql = " insert into "+table+" (filename,type_0,text,new_type,new_text) values('"+str(filename).replace(".0","")+"','"+str(type_0)+"','"+str(text)+"','"+str(new_type)+"','"+str(new_text)+"')"
            #sql = " update "+table+" set new_text='"+str(new_text)+"',new_type='"+str(new_type)+"' where filename='"+str(filename)+"' and text='"+str(text)+"' "
            cursor.execute(sql)
        conn.commit()
    conn.close()
def checklabel():
    '''
    @summary: check whether the labels are annotated correctly
    '''
    with codecs.open("ner_train.txt","r",encoding="utf8") as f:
        a = ""
        b = ""
        c = ""
        _index = 0
        while(True):
            _index += 1
            line = f.readline()
            if not line:
                break
            c = line.split(" ")[0].strip()
            if a=="新" and b=="乡" and c=="华":
                print(_index)
            a = b
            b = c
def updateLabel():
    '''
    @summary: update the labeled data
    '''
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    tables = ["fool_ner","fool_ner_train"]
    for table in tables:
        file = table+".xls"
        df = pd.read_excel(file)
        for filename,type_0,text,new_type,new_text in zip(df["list_filename"],df["list_type_0"],df["list_text"],df["list_new_type"],df["list_new_text"]):
            sql = " update "+table+" set new_type='"+str(new_type)+"',new_text='"+str(new_text)+"' where filename='"+str(filename).replace(".0","")+"' and text='"+str(text)+"'"
            print(sql)
            cursor.execute(sql)
        conn.commit()
    conn.close()
def makeCertainEntity():
    fileList = ["C:\\Users\\User\\Desktop\\cleanedEntity.tsv","C:\\Users\\User\\Desktop\\company_found.tsv"]
    for file in fileList:
        outfile = file.split(".")[0]+".txt"
        with codecs.open(outfile,"w",encoding="utf8") as f_w:
            with codecs.open(file,"r",encoding="utf8") as f:
                while(True):
                    line = f.readline().strip()
                    if not line:
                        break
                    for i in range(len(line)):
                        if i==0:
                            f_w.write(line[i]+" B_company\n")
                        elif i==len(line)-1:
                            f_w.write(line[i]+" E_company\n")
                            f_w.write("\n")
                        else:
                            f_w.write(line[i]+" M_company\n")
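
# The .txt files written here contain one entity per "sentence": each character is tagged
# B_company / M_company / E_company and entities are separated by a blank line, i.e. exactly
# the kind of bare-company sentence that readlabeldata() later embeds into a random context.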
def addContextToTheEntity(entity_file):
    def getContext(file):
        list_sent_label = []
        with codecs.open(file,"r",encoding="utf8") as f:
            sentence = []
            while(True):
                line = f.readline()
                if not line:
                    break
                if len(line)==1:
                    list_sent_label.append(sentence)
                    sentence = []
                else:
                    sentence.append([line[0],line.split()[-1].strip()])
        return list_sent_label
    list_sent_label = getContext("ner_label.txt")
    print("getContent done",len(list_sent_label))
    context_len = len(list_sent_label)
    outputfile = entity_file.split(".")[0]+"_addContext.txt"
    with codecs.open(outputfile,"w",encoding="utf8") as f_w:
        with codecs.open(entity_file,"r",encoding="utf8") as f_r:
            while(True):
                entity = f_r.readline().strip()
                if not entity:
                    break
                random_int = np.random.randint(0,context_len)
                _sentence = list_sent_label[random_int]
                _flag = 0
                for item in _sentence:
                    if _flag==0:
                        if item[1]=="B_company":
                            for word_index in range(len(entity)):
                                if word_index==0:
                                    f_w.write(entity[word_index]+" B_company\n")
                                elif word_index==len(entity)-1:
                                    f_w.write(entity[word_index]+" E_company\n")
                                else:
                                    f_w.write(entity[word_index]+" M_company\n")
                            _flag = 1
                        else:
                            f_w.write(item[0]+" "+item[1]+"\n")
                    elif _flag==1:
                        if item[1]=="E_company":
                            _flag = 2
                        else:
                            continue
                    else:
                        f_w.write(item[0]+" "+item[1]+"\n")
                f_w.write("\n")
def makeContext_by_fool_selffool():
    '''
    @summary: use the fool and selffool results to judge whether a sentence is recognized correctly;
              if fool and selffool agree, the sentence is treated as correct, otherwise it is left undecided
    '''
    import psycopg2
    conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
    list_filename_text = []
    cursor = conn.cursor()
    file_index = 0
    for file in glob.glob("C:\\Users\\User\\Desktop\\测试数据20190812\\*.html"):
        try:
            filename = file.split("\\")[-1]
            sql = " select count(1) from articles_processed_selffool where id='"+filename+"'"
            cursor.execute(sql)
            rows = cursor.fetchall()
            if rows[0][0]>0:
                continue
            content = codecs.open(file,"r",encoding="utf8").read()
            print(file_index,filename)
            text = Preprocessing.segment(Preprocessing.tableToText(BeautifulSoup(content,"lxml")))
            _article = Article(id=filename, content=text, sourceContent="", doc_id="", title="")
            persistArticle(conn,[_article],"articles_processed_selffool")
            list_sentences = []
            _sent_index = 0
            set_sentences = set()
            for x in re.split("[。]", text):
                if len(x)>0:
                    if x in set_sentences:
                        continue
                    set_sentences.add(x)
                    _sentence = Sentences(doc_id=filename,sentence_index=_sent_index,sentence_text=x+"。",tokens=[],pos_tags=[],ner_tags=[])
                    list_sentences.append(_sentence)
                    _ner_fool = fool.ner(_sentence.sentence_text)
                    _ner_selffool = Preprocessing.selffool.ner(_sentence.sentence_text)
                    if len(set(_ner_fool[0])&set(_ner_selffool[0]))==len(_ner_fool[0]):
                        table_entity = "entity_mention_selffool"
                    else:
                        table_entity = "entity_mention_selffool_notsame"
                    list_entitys = []
                    for item in _ner_selffool[0]:
                        _entity_id = filename+"_"+str(_sent_index)+"_"+str(item[0])+"_"+str(item[1])
                        _entity = Entity(doc_id=filename,entity_id=_entity_id,entity_text=item[3],entity_type=item[2],sentence_index=_sent_index,begin_index=item[0],end_index=item[1])
                        list_entitys.append(_entity)
                    persistEntity(conn,list_entitys,table_entity)
                    _sent_index += 1
            persistSentence(conn,list_sentences,"sentences_selffool")
            conn.commit()
        except Exception as e:
            print(e)
            conn.close()
            conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
            cursor = conn.cursor()
        finally:
            file_index += 1
    conn.close()
def makeCompare():
    '''
    @summary: estimate confidence by comparing fool with multiple versions of selffool
    '''
    bilstm_new = BiLSTM()
    path_add = "new_model/"
    path = 'model/'+path_add+'model.ckpt'
    bilstm_new.restore(path)
    conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select doc_id,sentence_index,sentence_text from sentences_selffool A where exists(select 1 from entity_mention_selffool_notsame B where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and fool_version='selffool') and not exists(select 1 from entity_mention_selffool_notsame B where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and fool_version='fool') "
    cursor.execute(sql)
    rows = cursor.fetchall()
    table_entity = "entity_mention_selffool_notsame"
    _index = 0
    try:
        for row in rows:
            _index += 1
            print(_index,len(rows))
            doc_id = row[0]
            sentence_index = row[1]
            '''
            sql = " select count(1) from "+table_entity+" where doc_id='"+doc_id+"' and sentence_index="+str(sentence_index)+" and fool_version='fool' "
            cursor.execute(sql)
            count_rows = cursor.fetchall()
            if count_rows[0][0]>0:
                continue
            '''
            text = row[2]
            _ner_entity_fool = set()
            _ner_entity_selffool = set()
            _ner_fool = fool.ner(text)[0]
            _ner_selffool = bilstm_new.ner(text)[0]
            list_entitys = []
            for item in _ner_fool:
                _entity_id = doc_id+"_"+str(sentence_index)+"_"+str(item[0])+"_"+str(item[1])
                _entity = Entity(doc_id=doc_id,entity_id=_entity_id,entity_text=item[3],entity_type=item[2],sentence_index=sentence_index,begin_index=item[0],end_index=item[1])
                list_entitys.append(_entity)
            persistEntity(conn,list_entitys,table_entity)
            conn.commit()
            '''
            for item in _ner_fool:
                if item[2] in ["org","company"]:
                    _ner_entity_fool.add(item)
            for item in _ner_selffool:
                if item[2] in ["org","company"]:
                    _ner_entity_selffool.add(item)
            if len(_ner_entity_fool&_ner_entity_selffool)==len(_ner_entity_fool) and len(_ner_entity_fool)==len(_ner_entity_selffool):
                print(text)
                print(_ner_selffool)
            '''
    except Exception as e:
        print(e)
        conn.close()
        conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
        cursor = conn.cursor()
    conn.close()
def cluster_difference():
    '''
    @summary: cluster the truncated tails
    '''
    conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select entity_id,doc_id,sentence_index,begin_index,end_index,entity_type,entity_text,fool_version from entity_mention_selffool_notsame where entity_type in ('org','company') order by entity_id "
    cursor.execute(sql)
    rows = cursor.fetchall()
    row_begin = 0
    DIFF_LEN = 2
    dict_diff_list = dict()
    while(row_begin<len(rows)-1):
        print(row_begin,len(rows))
        doc_id = rows[row_begin][1]
        sentence_index = rows[row_begin][2]
        row_end = row_begin
        for _i in range(row_begin+1,len(rows)):
            row_end = _i
            if rows[_i][1]==doc_id and rows[_i][2]==sentence_index:
                continue
            else:
                break
        list_entitys_fool = []
        list_entitys_selffool = []
        #group the entities of the same sentence of the same article
        for _row in rows[row_begin:row_end]:
            entity_id = _row[0]
            begin_index = _row[3]
            end_index = _row[4]-1
            entity_type = _row[5]
            entity_text = _row[6]
            fool_version = _row[7]
            if entity_type in ["org","company"]:
                _entity = [entity_id,begin_index,end_index,entity_text,entity_type]
                if fool_version=="fool":
                    list_entitys_fool.append(_entity)
                else:
                    list_entitys_selffool.append(_entity)
        row_begin = row_end
        #traverse the fool and selffool results separately and cluster the differences
        list_key_entityid = []
        for _entity_fool in list_entitys_fool:
            entity_id_src = _entity_fool[0]
            begin_index_src = _entity_fool[1]
            end_index_src = _entity_fool[2]
            entity_text_src = _entity_fool[3]
            entity_type_src = _entity_fool[4]
            for _entity_selffool in list_entitys_selffool:
                entity_id_des = _entity_selffool[0]
                begin_index_des = _entity_selffool[1]
                end_index_des = _entity_selffool[2]
                entity_text_des = _entity_selffool[3]
                entity_type_des = _entity_selffool[4]
                if min(end_index_src,end_index_des)>max(begin_index_des,begin_index_src):
                    if begin_index_src==begin_index_des:
                        _key_begin = "SAME"
                    else:
                        _key_begin = entity_text_src[0:min(DIFF_LEN,len(entity_text_src))]+"#"+entity_text_des[0:min(DIFF_LEN,len(entity_text_des))]
                    if end_index_src==end_index_des:
                        _key_end = "SAME"
                    else:
                        _key_end = entity_text_src[-min(DIFF_LEN,len(entity_text_src)):]+"#"+entity_text_des[-min(DIFF_LEN,len(entity_text_des)):]
                    _key = _key_begin+"|"+_key_end
                    list_key_entityid.append([_key,[entity_id_src,entity_id_des]])
        #check whether an entity is unique to one version
        for _entity_fool in list_entitys_fool:
            entity_id_src = _entity_fool[0]
            begin_index_src = _entity_fool[1]
            end_index_src = _entity_fool[2]
            entity_text_src = _entity_fool[3]
            entity_type_src = _entity_fool[4]
            find_flag = False
            for item in list_key_entityid:
                if entity_id_src in item[1]:
                    find_flag = True
            if not find_flag:
                _key = "fool|"+entity_text_src[-min(DIFF_LEN,len(entity_text_src)):]
                list_key_entityid.append([_key,[entity_id_src]])
        for _entity_fool in list_entitys_selffool:
            entity_id_src = _entity_fool[0]
            begin_index_src = _entity_fool[1]
            end_index_src = _entity_fool[2]
            entity_text_src = _entity_fool[3]
            entity_type_src = _entity_fool[4]
            find_flag = False
            for item in list_key_entityid:
                if entity_id_src in item[1]:
                    find_flag = True
            if not find_flag:
                _key = "selffool|"+entity_text_src[-min(DIFF_LEN,len(entity_text_src)):]
                list_key_entityid.append([_key,[entity_id_src]])
        #cluster by key
        for item in list_key_entityid:
            find_flag = False
            if item[0]=="SAME|SAME":
                continue
            for _key in dict_diff_list.keys():
                if item[0]==_key:
                    dict_diff_list[_key].append(item[1])
                    find_flag = True
            if not find_flag:
                dict_diff_list[item[0]] = [item[1]]
    print(len(dict_diff_list.keys()))
    list_key_count = []
    for _key in dict_diff_list.keys():
        list_key_count.append([_key,len(dict_diff_list[_key])])
    list_key_count.sort(key=lambda x:x[1],reverse=True)
    with codecs.open("diff_key_count.txt","w",encoding="utf8") as f:
        for item in list_key_count:
            f.write(item[0]+"\t"+str(item[1])+"\n")
    save(dict_diff_list,"dict_diff_list.pk")
dict_sentence = None
def get_sentence(doc_id,sentence_index):
    global dict_sentence
    file_dict_sentence = "dict_sentence.pk"
    if dict_sentence is None:
        if os.path.exists(file_dict_sentence):
            dict_sentence = load(file_dict_sentence)
        else:
            conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
            cursor = conn.cursor()
            sql = " select doc_id,sentence_index,sentence_text from sentences_selffool "
            cursor.execute(sql)
            dict_sentence = dict()
            rows = cursor.fetchall()
            for row in rows:
                _doc_id = row[0]
                _sentence_index = row[1]
                _sentence_text = row[2]
                _key = _doc_id+str(_sentence_index)
                dict_sentence[_key] = _sentence_text
            save(dict_sentence,file_dict_sentence)
    _key = doc_id+str(sentence_index)
    if _key in dict_sentence.keys():
        return dict_sentence[_key]
    return None
dict_diff_list = None
def viewEntityByKey():
    global dict_diff_list
    if dict_diff_list is None:
        dict_diff_list = load("dict_diff_list.pk")
    CONTEXT_LEN = 15
    for key in dict_diff_list.keys():
        diff_list = dict_diff_list[key]
        file = "cluster_view/"+re.sub("[\*\|\/\r\n:]","",key.strip())+".xls"
        if os.path.exists(file):
            continue
        list_entityid = []
        list_before = []
        list_center = []
        list_after = []
        list_change = []
        list_type = []
        list_version = []
        if len(diff_list[0])==2:
            for item in diff_list:
                for i in range(len(item)):
                    if i==0:
                        list_version.append("fool")
                    else:
                        list_version.append("selffool")
                    entityid = item[i]
                    split_entityid = entityid.split("html")[1].split("_")
                    doc_id = entityid.split("html")[0]+"html"
                    sentence_index = split_entityid[1]
                    sentence_text = get_sentence(doc_id, sentence_index)
                    begin_index = int(split_entityid[2])
                    end_index = int(split_entityid[3])-1
                    list_entityid.append(entityid)
                    before = sentence_text[max(0,begin_index-CONTEXT_LEN):begin_index]
                    center = sentence_text[begin_index:end_index]
                    after = sentence_text[end_index:min(end_index+CONTEXT_LEN,len(sentence_text))]
                    list_before.append(before)
                    list_center.append(center)
                    list_after.append(after)
                    list_change.append(center)
                    list_type.append("")
        else:
            version = key.split("|")[0]
            for item in diff_list:
                list_version.append(version)
                entityid = item[0]
                split_entityid = entityid.split("html")[1].split("_")
                doc_id = entityid.split("html")[0]+"html"
                sentence_index = split_entityid[1]
                sentence_text = get_sentence(doc_id, sentence_index)
                begin_index = int(split_entityid[2])
                end_index = int(split_entityid[3])-1
                list_entityid.append(entityid)
                before = sentence_text[max(0,begin_index-CONTEXT_LEN):begin_index]
                center = sentence_text[begin_index:end_index]
                after = sentence_text[end_index:min(end_index+CONTEXT_LEN,len(sentence_text))]
                list_before.append(before)
                list_center.append(center)
                list_after.append(after)
                list_change.append(center)
                list_type.append("")
        data = {"list_entityid":list_entityid,
                "list_before":list_before,
                "list_center":list_center,
                "list_after":list_after,
                "list_change":list_change,
                "list_type":list_type,
                "list_version":list_version}
        df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
        df.to_excel(file)
def alterFileByRule(file):
    df = pd.read_excel(file)
    _location = "location"
    for _index in range(len(df["list_entityid"])):
        version = df["list_version"][_index]
        if version=="selffool":
            ''''''
            df["list_change"][_index] = df["list_change"][_index-1]
    df.to_excel(file,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
def getCompanyByTianyan():
    token = "b775e2ed-d919-4d5f-8ab1-406d82d6bb56"
    headers = {"Authorization":token}
    url = "http://open.api.tianyancha.com/services/v4/open/searchV2?word="
    _inTianYan = "inTianYan"
    _inSource = "inSource"
    _dict = load("dict_company.pk")
    count = 0
    for entity in load("set_company.pk"):
        try:
            count += 1
            print(count,len(_dict.keys()))
            if entity in _dict:
                _dict[entity][_inSource] = True
                if _dict[entity][_inTianYan]:
                    continue
            else:
                _dict[entity] = {_inTianYan:False,_inSource:True}
            r = requests.get(url+entity,headers=headers)
            r_json = r.json()
            if r_json["error_code"]==0:
                for item in r_json["result"]["items"]:
                    companyName = re.sub("</?em>","",item["name"]).replace("(","(").replace(")",")")
                    if companyName in _dict:
                        _dict[companyName][_inTianYan] = True
                    else:
                        _dict[companyName] = {_inTianYan:True,_inSource:False}
            elif r_json["error_code"]==300007:
                print("insufficient remaining API quota")
                break
        except Exception as e:
            print(str(e))
    save(_dict,"dict_company.pk")
def labelByTianyan():
    '''
    @summary: obtain labels through the Tianyancha data API
    '''
    list_entityid = []
    list_before = []
    list_center = []
    list_after = []
    list_change = []
    list_type = []
    list_version = []
    list_entityid_notmatch = []
    list_before_notmatch = []
    list_center_notmatch = []
    list_after_notmatch = []
    list_change_notmatch = []
    list_type_notmatch = []
    list_version_notmatch = []
    _inTianYan = "inTianYan"
    _inSource = "inSource"
    _dict_company = load("dict_company.pk")
    is_compare = False
    for file in glob.glob("cluster_view/add/*.xls"):
        df = pd.read_excel(file)
        for _index in range(len(df["list_change"])):
            version = df["list_version"][_index]
            if version in ["selffool","fool"]:
                _match_count = 0
                true_entity = None
                if df["list_change"][_index] in _dict_company:
                    if _dict_company[df["list_change"][_index]][_inTianYan]:
                        _match_count += 1
                        true_entity = df["list_change"][_index]
                if is_compare:
                    if df["list_change"][_index-1] in _dict_company:
                        if _dict_company[df["list_change"][_index-1]][_inTianYan]:
                            _match_count += 1
                            true_entity = df["list_change"][_index-1]
                if _match_count==1:
                    if is_compare:
                        list_entityid.append(df["list_entityid"][_index-1])
                        list_before.append(df["list_before"][_index-1])
                        list_center.append(df["list_center"][_index-1])
                        list_after.append(df["list_after"][_index-1])
                        list_change.append(df["list_change"][_index-1])
                        list_type.append(df["list_type"][_index-1])
                        list_version.append(df["list_version"][_index-1])
                    list_entityid.append(df["list_entityid"][_index])
                    list_before.append(df["list_before"][_index])
                    list_center.append(df["list_center"][_index])
                    list_after.append(df["list_after"][_index])
                    list_change.append(true_entity)
                    list_type.append(df["list_type"][_index])
                    list_version.append(df["list_version"][_index])
                else:
                    if is_compare:
                        list_entityid_notmatch.append(df["list_entityid"][_index-1])
                        list_before_notmatch.append(df["list_before"][_index-1])
                        list_center_notmatch.append(df["list_center"][_index-1])
                        list_after_notmatch.append(df["list_after"][_index-1])
                        list_change_notmatch.append(df["list_change"][_index-1])
                        list_type_notmatch.append(df["list_type"][_index-1])
                        list_version_notmatch.append(df["list_version"][_index-1])
                    list_entityid_notmatch.append(df["list_entityid"][_index])
                    list_before_notmatch.append(df["list_before"][_index])
                    list_center_notmatch.append(df["list_center"][_index])
                    list_after_notmatch.append(df["list_after"][_index])
                    list_change_notmatch.append(df["list_change"][_index])
                    list_type_notmatch.append(df["list_type"][_index])
                    list_version_notmatch.append(df["list_version"][_index])
    data = {"list_entityid":list_entityid,
            "list_before":list_before,
            "list_center":list_center,
            "list_after":list_after,
            "list_change":list_change,
            "list_type":list_type,
            "list_version":list_version}
    df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
    df.to_excel("cluster_view/add_match.xls")
    nums = 50000
    _begin = 0
    while(_begin<len(list_entityid_notmatch)):
        data = {"list_entityid":list_entityid_notmatch[_begin:_begin+nums],
                "list_before":list_before_notmatch[_begin:_begin+nums],
                "list_center":list_center_notmatch[_begin:_begin+nums],
                "list_after":list_after_notmatch[_begin:_begin+nums],
                "list_change":list_change_notmatch[_begin:_begin+nums],
                "list_type":list_type_notmatch[_begin:_begin+nums],
                "list_version":list_version_notmatch[_begin:_begin+nums]}
        df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
        df.to_excel("cluster_view/add_notmatch_"+str(_begin)+".xls")
        _begin += nums
  1281. def cluster_entitys():
  1282. '''
  1283. @summary: 对实体进行聚类,统一截断
  1284. '''
  1285. tail_pattern = re.compile("学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
    dict_tail_entitys = {}
    listfile = ["cluster_view/*_match.xls","cluster_view/done/*.xls","cluster_view/tofix/done/*.xls"]
    count = 0
    for globfile in listfile:
        for file in glob.glob(globfile):
            isadd = re.search("fool|add",file) is not None
            count += 1
            print(count,file)
            df = pd.read_excel(file)
            list_entityid = df["list_entityid"]
            list_before = df["list_before"]
            list_center = df["list_center"]
            list_after = df["list_after"]
            list_change = df["list_change"]
            list_type = df["list_type"]
            list_version = df["list_version"]
            for _index in range(len(list_entityid)):
                '''
                # skip rows already labeled "1"
                if str(list_change[_index])=="1":
                    continue
                '''
                # skip the fool rows that are only kept for side-by-side comparison
                if not isadd and list_version[_index]=="fool":
                    continue
                if str(list_change[_index])=="1":
                    _key = "-1-"
                else:
                    _find = re.findall(tail_pattern,str(list_change[_index]))
                    if len(_find)==0:
                        _key = "other"
                    else:
                        _key = "-".join(_find)
                if _key in dict_tail_entitys:
                    dict_tail_entitys[_key]["list_entityid"].append(list_entityid[_index])
                    dict_tail_entitys[_key]["list_before"].append(list_before[_index])
                    dict_tail_entitys[_key]["list_center"].append(list_center[_index])
                    dict_tail_entitys[_key]["list_after"].append(list_after[_index])
                    dict_tail_entitys[_key]["list_change"].append(list_change[_index])
                    dict_tail_entitys[_key]["list_type"].append(list_type[_index])
                    dict_tail_entitys[_key]["list_version"].append(list_version[_index])
                else:
                    dict_tail_entitys[_key] = {"list_entityid":[list_entityid[_index]],
                                               "list_before":[list_before[_index]],
                                               "list_center":[list_center[_index]],
                                               "list_after":[list_after[_index]],
                                               "list_change":[list_change[_index]],
                                               "list_type":[list_type[_index]],
                                               "list_version":[list_version[_index]]}
    print(len(dict_tail_entitys.keys()))
    for _key in dict_tail_entitys.keys():
        filename = "cluster_view/cluster/"+_key+".xls"
        nums = 50000
        _begin = 0
        # Note: the existence check uses "<key>.xls" while the chunks below are written as
        # "<key>-<n>.xls", so the skip only triggers if a file with the bare key name exists.
        if os.path.exists(filename):
            continue
        while(_begin*nums<len(dict_tail_entitys[_key]["list_entityid"])):
            data = {"list_entityid":dict_tail_entitys[_key]["list_entityid"][_begin*nums:(_begin+1)*nums],
                    "list_before":dict_tail_entitys[_key]["list_before"][_begin*nums:(_begin+1)*nums],
                    "list_center":dict_tail_entitys[_key]["list_center"][_begin*nums:(_begin+1)*nums],
                    "list_after":dict_tail_entitys[_key]["list_after"][_begin*nums:(_begin+1)*nums],
                    "list_change":dict_tail_entitys[_key]["list_change"][_begin*nums:(_begin+1)*nums],
                    "list_type":dict_tail_entitys[_key]["list_type"][_begin*nums:(_begin+1)*nums],
                    "list_version":dict_tail_entitys[_key]["list_version"][_begin*nums:(_begin+1)*nums]}
            df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
            df.to_excel("cluster_view/cluster/"+_key+"-"+str(_begin)+".xls")
            _begin += 1
def tofix():
    '''
    @summary: Collect all the data that still needs labeling, filter some of it out with rules, then split it evenly
    '''
    conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select entity_id,fool_version,entity_text from entity_mention_selffool_notsame"
    cursor.execute(sql)
    dict_idversion_entity = {}
    rows = cursor.fetchall()
    for row in rows:
        entity_id = row[0]
        fool_version = row[1]
        entity_text = row[2]
        _key = entity_id+"-"+fool_version
        dict_idversion_entity[_key] = entity_text
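    # dict_idversion_entity maps "<entity_id>-<fool_version>" to the latest entity_text in the table.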
    list_entityid_tofix = []
    list_before_tofix = []
    list_center_tofix = []
    list_after_tofix = []
    list_change_tofix = []
    list_type_tofix = []
    list_version_tofix = []
    list_entityid_nottofix = []
    list_before_nottofix = []
    list_center_nottofix = []
    list_after_nottofix = []
    list_change_nottofix = []
    list_type_nottofix = []
    list_version_nottofix = []
    nottofix_pattern = "^[A-Za-z0-9\-]*[省市区县州镇]|^[A-Za-z0-9\-]+$"
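    # Illustration (hypothetical values of the corrected text): rows matching nottofix_pattern
    # are treated as not needing manual fixing.
    #   re.search(nottofix_pattern, "B-2区")               -> match  (ASCII prefix ending in an admin-division suffix)
    #   re.search(nottofix_pattern, "GZ2018")              -> match  (purely ASCII letters/digits/hyphens)
    #   re.search(nottofix_pattern, "龙山煤业有限责任公司")  -> None   (kept for manual fixing)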
    listfile = ["cluster_view/*notmatch*.xls"]
    count = 0
    data = []
    for globfile in listfile:
        for file in glob.glob(globfile):
            isadd = re.search("fool|add",file) is not None
            count += 1
            print(count)
            df = pd.read_excel(file)
            list_entityid = df["list_entityid"]
            list_before = df["list_before"]
            list_center = df["list_center"]
            list_after = df["list_after"]
            list_change = df["list_change"]
            list_type = df["list_type"]
            list_version = df["list_version"]
            for _index in range(len(list_entityid)):
                if not isadd and list_version[_index]=="fool":
                    continue
                _key = str(list_entityid[_index])+"-"+str(list_version[_index])
                if _key in dict_idversion_entity and dict_idversion_entity[_key]!=list_center[_index]:
                    list_center[_index] = dict_idversion_entity[_key]
                    list_change[_index] = dict_idversion_entity[_key]
                data.append([str(list_entityid[_index]),str(list_before[_index]),str(list_center[_index]),str(list_after[_index]),str(list_change[_index]),str(list_type[_index]),str(list_version[_index])])
    data.sort(key=lambda x:x[4])
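    # Sort by column 4 (the corrected text) so that identical or similar corrections sit next
    # to each other before the even split below.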
    for item in data:
        entityid = item[0]
        before = item[1]
        center = item[2]
        after = item[3]
        change = item[4]
        type = item[5]
        version = item[6]
        if re.search(nottofix_pattern,change) is not None:
            list_entityid_nottofix.append(entityid)
            list_before_nottofix.append(before)
            list_center_nottofix.append(center)
            list_after_nottofix.append(after)
            list_change_nottofix.append(change)
            list_type_nottofix.append(type)
            list_version_nottofix.append(version)
        else:
            list_entityid_tofix.append(entityid)
            list_before_tofix.append(before)
            list_center_tofix.append(center)
            list_after_tofix.append(after)
            list_change_tofix.append(change)
            list_type_tofix.append(type)
            list_version_tofix.append(version)
    parts = 16
    nums = len(list_entityid_tofix)//parts
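    # Split the to-fix rows into roughly equal parts. With integer division, any remainder
    # spills into one extra final file, so up to parts+1 files can be written.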
    _begin = 0
    while(_begin*nums<len(list_entityid_tofix)):
        data = {"list_entityid":list_entityid_tofix[_begin*nums:(_begin+1)*nums],
                "list_before":list_before_tofix[_begin*nums:(_begin+1)*nums],
                "list_center":list_center_tofix[_begin*nums:(_begin+1)*nums],
                "list_after":list_after_tofix[_begin*nums:(_begin+1)*nums],
                "list_change":list_change_tofix[_begin*nums:(_begin+1)*nums],
                "list_type":list_type_tofix[_begin*nums:(_begin+1)*nums],
                "list_version":list_version_tofix[_begin*nums:(_begin+1)*nums]}
        df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
        df.to_excel("cluster_view/tofix/"+str(_begin)+".xls")
        _begin += 1
    data = {"list_entityid":list_entityid_nottofix,
            "list_before":list_before_nottofix,
            "list_center":list_center_nottofix,
            "list_after":list_after_nottofix,
            "list_change":list_change_nottofix,
            "list_type":list_type_nottofix,
            "list_version":list_version_nottofix}
    df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
    df.to_excel("cluster_view/tofix/nottofix.xls")
def updateEntityview():
    '''
    @summary: Write the data in the view back to the database. Updates take the selffool rows as the reference; rows newly added by fool are inserted with fool_version changed to selffool_add
    '''
    conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    listfile = ["cluster_view/cluster/*.xls"]
    count = 0
    for globfile in listfile:
        for file in glob.glob(globfile):
            count += 1
            print(count,file)
            df = pd.read_excel(file)
            for _index in range(len(df["list_entityid"])):
                entity_id = df["list_entityid"][_index]
                doc_id = entity_id.split("html")[0]+"html"
                list_index = entity_id.split("html")[1].split("_")
                sentence_index = list_index[1]
                begin_index = list_index[2]
                end_index = list_index[3]
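                # Assumed entity_id layout: "<doc_id ending in html>_<sentence>_<begin>_<end>",
                # e.g. "123.html_5_10_16" -> doc_id "123.html", sentence_index "5",
                # begin_index "10", end_index "16" (example values are hypothetical).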
                change = str(df["list_change"][_index])
                type = str(df["list_type"][_index])
                version = str(df["list_version"][_index])
                if version=="fool":
                    sql = " update entity_mention_selffool_notsame set new_text='"+str(change)+"',new_type='"+type+"',fool_version='fool_add' where entity_id='"+entity_id+"' and fool_version='fool' "
                    cursor.execute(sql)
                else:
                    sql = " update entity_mention_selffool_notsame set new_text='"+str(change)+"',new_type='"+type+"' where entity_id='"+entity_id+"' and fool_version='selffool' "
                    cursor.execute(sql)
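                # The statements above are built by string concatenation, so a quote inside
                # `change` would break them. A minimal sketch of the same update using
                # psycopg2 parameter binding (not the original code, behavior otherwise equivalent):
                #   cursor.execute(
                #       "update entity_mention_selffool_notsame set new_text=%s,new_type=%s "
                #       "where entity_id=%s and fool_version='selffool'",
                #       (change, type, entity_id))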
    conn.commit()
    conn.close()
def makeLabelText():
    '''
    @summary: Updating the database row by row is too slow, so query the data out, apply the replacements directly, then generate the training data
    '''
    conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    # build the replacement dict from the clustering results
    dict_replace = dict()
    listfile = ["cluster_view/cluster/*.xls"]
    count = 0
    for globfile in listfile:
        for file in glob.glob(globfile):
            count += 1
            print(count,file)
            df = pd.read_excel(file)
            for _index in range(len(df["list_entityid"])):
                entity_id = df["list_entityid"][_index]
                doc_id = entity_id.split("html")[0]+"html"
                list_index = entity_id.split("html")[1].split("_")
                sentence_index = list_index[1]
                begin_index = list_index[2]
                end_index = list_index[3]
                change = str(df["list_change"][_index])
                type = str(df["list_type"][_index])
                version = str(df["list_version"][_index])
                _key = entity_id+version
                dict_replace[_key] = [change,type]
    print("get dict_replace done")
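    # dict_replace maps "<entity_id><fool_version>" to [corrected_text, corrected_type],
    # e.g. (hypothetical): {"123.html_5_10_16selffool": ["北京大学第一医院", "org"]}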
    # query the entities from the database, ordered by entity_id
    data_entity = []
    sql = " select entity_id,doc_id,sentence_index,entity_text,entity_type,'selffool' from entity_mention_selffool order by entity_id "
    cursor.execute(sql)
    rows = cursor.fetchall()
    data_entity = data_entity+rows
    sql = " select entity_id,doc_id,sentence_index,entity_text,entity_type,fool_version from entity_mention_selffool_notsame order by entity_id "
    cursor.execute(sql)
    rows = cursor.fetchall()
    data_entity = data_entity+rows
    # build a doc_id-sentence_index -> list of [entity_text, entity_type] dict
    dict_sent_entitys = dict()
    _begin = 0
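    # The loop below is a two-pointer scan: [_begin, _end) covers a run of consecutive rows that
    # share doc_id and sentence_index. Note that data_entity is the concatenation of two separately
    # ordered queries, so rows for the same sentence are only grouped if they are adjacent in that
    # combined list.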
    while(_begin<len(data_entity)-1):
        _begin_doc_id = data_entity[_begin][1]
        _begin_sentence_index = data_entity[_begin][2]
        _end = _begin
        print(_begin)
        for end in range(_begin+1,len(data_entity)):
            _end = end
            _end_doc_id = data_entity[end][1]
            _end_sentence_index = data_entity[end][2]
            if _begin_doc_id==_end_doc_id and _begin_sentence_index==_end_sentence_index:
                continue
            else:
                break
        for item in data_entity[_begin:_end]:
            entity_id = item[0]
            doc_id = item[1]
            sentence_index = item[2]
            entity_text = item[3]
            entity_type = item[4]
            version = item[5]
            _key = doc_id+"-"+str(sentence_index)
            if _key not in dict_sent_entitys:
                dict_sent_entitys[_key] = []
            # apply the replacement
            if entity_type in ["org","company"]:
                _key1 = entity_id+version
                if _key1 in dict_replace:
                    if str(dict_replace[_key1][0])=="1":
                        continue
                    if dict_replace[_key1][1] in ["org","company","person","location"]:
                        for item in re.split("##",dict_replace[_key1][0]):
                            dict_sent_entitys[_key].append([item,dict_replace[_key1][1]])
                    else:
                        for item in re.split("##",dict_replace[_key1][0]):
                            dict_sent_entitys[_key].append([item,entity_type])
                else:
                    if version=="selffool":
                        dict_sent_entitys[_key].append([entity_text,entity_type])
                    else:
                        dict_sent_entitys[_key].append([entity_text,entity_type])
        _begin = _end
    print("get dict_sent_entitys done")
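    # dict_sent_entitys maps "<doc_id>-<sentence_index>" to the entities to label in that sentence,
    # e.g. (hypothetical): {"123.html-5": [["北京大学第一医院", "org"]]}
    # A corrected text may contain several entities separated by "##", which is why it is split
    # before being appended above.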
    # query the sentences
    sql = " select doc_id,sentence_index,sentence_text from sentences_selffool order by doc_id "
    cursor.execute(sql)
    list_sentence = cursor.fetchall()
    count = 0
    with codecs.open("selffool_train.txt","w",encoding="utf8") as f:
        for sent in list_sentence:
            count += 1
            print(count)
            _key = sent[0]+"-"+str(sent[1])
            sentence = sent[2]
            if len(sentence)>2000:
                continue
            if _key in dict_sent_entitys:
                data_item,_find_flag = makeLabel(sentence, dict_sent_entitys[_key])
                for _word,_label in data_item:
                    f.write(_word+" "+_label+"\n")
            else:
                if np.random.random()>0.8:
                    data_item,_find_flag = makeLabel(sentence, [])
                    for _word,_label in data_item:
                        f.write(_word+" "+_label+"\n")
            f.write("\n")
            f.flush()
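    # Each non-empty line of selffool_train.txt is "<token> <label>" as produced by makeLabel;
    # sentences are separated by blank lines. Sentences with no recorded entities are sampled in
    # at a rate of roughly 20% as negative examples.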
if __name__=="__main__":
    #makeFoolTrainData()
    #makeTrainTxt()
    #labelEntity()
    #readlabeldata("cleanedEntity.txt",getContext("ner_train.txt"))
    #makeDict_filename_content()
    #selectByRule()
    #updateLabel()
    #importLabelData()
    #makeCertainEntity()
    #addContextToTheEntity("company_found.tsv")
    #makeContext_by_fool_selffool()
    #makeCompare()
    #cluster_difference()
    #viewEntityByKey()
    #alterFileByRule("cluster_view/change/SAME版社#大学.xls")
    #getCompanyByTianyan()
    '''
    data = load("dict_company.pk")
    for item in data.keys():
        print(item,data[item])
    '''
    #labelByTianyan()
    '''
    token = "b775e2ed-d919-4d5f-8ab1-406d82d6bb56"
    headers = {"Authorization":token}
    url = "http://open.api.tianyancha.com/services/v4/open/searchV2?word="
    r = requests.get(url+"安阳鑫龙煤业(集团)龙山煤业有限责任公司",headers=headers)
    r_json = r.json()
    print(r_json)
    '''
    #tofix()
    cluster_entitys()
    makeLabelText()