# self_preprocess.py

import datetime
import logging

from brat.models import BratAnnotation as brat_annotations

from iepy.preprocess.pipeline import BasePreProcessStepRunner
from iepy.preprocess.ner.base import FoundEntity
from iepy.selfpreprocess.pipeline import PreProcessSteps
from iepy.selfpreprocess.BiddingKG.dl.common.Utils import *  # provides log()
import iepy.selfpreprocess.BiddingKG.dl.interface.predictor as predictor
import iepy.selfpreprocess.BiddingKG.dl.interface.Preprocessing as Preprocessing
import iepy.selfpreprocess.BiddingKG.dl.interface.getAttributes as getAttributes
import iepy.selfpreprocess.BiddingKG.dl.entityLink.entityLink as entityLink
import iepy.selfpreprocess.BiddingKG.dl.complaint.punish_predictor as punish_rule
from iepy.webui.brat.src import annotator

logger = logging.getLogger(__name__)
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()
punish = punish_rule.Punish_Extract()
productPredict = predictor.ProductPredictor()
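
# Note: the predictors above are created once, at import time, so their model
# weights are loaded a single time and shared across every document processed here.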

def predict(doc_id, text):
    """Run the whole BiddingKG extraction chain over a single document."""
    log("process %s" % doc_id)
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", ""]], useselffool=True)
    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    log("codeName %s" % str(codeName))
    premPredict.predict(list_sentences, list_entitys)
    productPredict.predict(list_sentences, list_entitys)
    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    log("epcPredict")
    epcPredict.predict(list_sentences, list_entitys)
    log("timePredict")
    timePredict.predict(list_sentences, list_entitys)
    log("entityLink")
    entityLink.link_entitys(list_entitys)
    log("getPREMs")
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    log("extract done %s" % (str(prem)))
    return list_articles, list_sentences, list_entitys
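
# A minimal usage sketch (hypothetical document id and text; the empty strings
# mirror the placeholder fields get_preprocessed() is given per row above):
#
#     articles, sentences, entitys = predict("doc-001", "...announcement text...")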

dict_type = {"org": {"0": "org_tenderee",
                     "1": "org_agency",
                     "2": "org_tenderer",
                     "3": "org_secondTenderer",
                     "4": "org_thirdTenderer",
                     "5": "org"},
             "company": {"0": "company_tenderee",
                         "1": "company_agency",
                         "2": "company_tenderer",
                         "3": "company_secondTenderer",
                         "4": "company_thirdTenderer",
                         "5": "company"},
             "money": {"0": "money_tendereeMoney",
                       "1": "money_tendererMoney",
                       "2": "money"},
             "person": {"0": "person",
                        "1": "person_tendereePerson",
                        "2": "person_agencyPerson",
                        "3": "person_person",
                        "4": "person_review"},
             "time": {"0": "time",
                      "1": "time_release",
                      "2": "time_bidopen",
                      "3": "time_bidclose"}}

dict_role_attribute = {"0": "att_tenderee",
                       "1": "att_agency",
                       "2": "att_tenderer",
                       "3": "att_secondTenderer",
                       "4": "att_thirdTenderer",
                       "5": "att_noRole"}

dict_money_attribute = {"0": "att_tendereeMoney",
                        "1": "att_tendererMoney",
                        "2": "money"}

dict_person_attribute = {"0": "att_noperson",
                         "1": "att_tendereePerson",
                         "2": "att_agencyPerson",
                         "3": "att_person"}

dict_relations = {"pointer_pack": "rel_pack",
                  "pointer_money": "rel_tendererMoney",
                  "pointer_person": "rel_person",
                  "pointer_address": "rel_address",
                  "pointer_tendereeMoney": "rel_tendereeMoney"}

def getAttribute(_entity):
    attribute = {"role": None, "money": None, "person": None}
    if _entity.entity_type in ["org", "company"]:
        attribute["role"] = dict_role_attribute[str(_entity.label)]
    if _entity.entity_type in ["money"]:
        attribute["money"] = dict_money_attribute[str(_entity.label)]
    if _entity.entity_type in ["person"]:
        attribute["person"] = dict_person_attribute[str(_entity.label)]
    # Keep only the attribute that applies to this entity type.
    return {key: value for key, value in attribute.items() if value is not None}

def getType(_entity):
    if _entity.entity_type in dict_type:
        if str(_entity.label) in dict_type[_entity.entity_type]:
            return dict_type[_entity.entity_type][str(_entity.label)]
    return _entity.entity_type
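
# Example: an "org" entity with label 0 maps to the span type "org_tenderee";
# any (type, label) pair without a mapping falls back to the raw entity_type.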

class SelfAnalizer:

    def __init__(self, doc_id, sourceText):
        self.docid = doc_id
        list_articles, list_sentences, list_entitys = predict(doc_id, sourceText)
        self.article = list_articles[0]
        self.sentences = list_sentences[0]
        self.entitys = list_entitys[0]
        self.dict_sentences = self.get_sentences()
        # Delete any annotations previously stored for this document.
        brat_annotations.objects.filter(document_id=doc_id).delete()

    def get_sentences(self):
        """
        Indexes every sentence by sentence_index together with its character
        span ("offset_word") and its token span ("offset_words") inside the
        whole document.
        """
        dict_sentences = dict()
        offset_word = 0   # running character offset
        offset_words = 0  # running token offset
        self.sentences.sort(key=lambda x: x.sentence_index)
        for sentence in self.sentences:
            if sentence.sentence_index not in dict_sentences:
                dict_sentences[sentence.sentence_index] = {"object": sentence,
                                                           "offset_word": [-1, -1],
                                                           "offset_words": [-1, -1]}
            dict_sentences[sentence.sentence_index]["offset_word"] = [offset_word, offset_word + len(sentence.sentence_text)]
            dict_sentences[sentence.sentence_index]["offset_words"] = [offset_words, offset_words + len(sentence.tokens)]
            offset_word += len(sentence.sentence_text)
            offset_words += len(sentence.tokens)
        return dict_sentences
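
    # Example: with two sentences of 10 and 8 characters (3 and 2 tokens), entry 0
    # holds offset_word [0, 10] / offset_words [0, 3] and entry 1 holds
    # offset_word [10, 18] / offset_words [3, 5].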

    def get_sentence_boundaries(self):
        """
        Returns a list with the offsets, in characters, at which each sentence
        starts, in order. The list contains one extra element at the end with
        the total number of characters.
        """
        ys = [0]
        for sentence in self.sentences:
            y = self.dict_sentences[sentence.sentence_index]["offset_word"][-1]
            ys.append(y)
        return ys
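
    # Continuing the example above, the boundaries would be [0, 10, 18].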

    def get_parse_trees(self):
        # Syntactic parsing is not performed by this preprocessor.
        pass

    def get_tokens(self):
        list_tokens = []
        for sentence in self.sentences:
            list_tokens.extend(sentence.tokens)
        return list_tokens

    def get_lemmas(self):
        # Lemmatization is not performed; callers get an empty list.
        return []

    def get_token_offsets(self):
        # Cumulative character offsets: entry i is where token i starts, plus
        # one trailing entry holding the total character count.
        list_offset = [0]
        for sentence in self.sentences:
            for token in sentence.tokens:
                _offset = list_offset[-1] + len(token)
                list_offset.append(_offset)
        return list_offset
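
    # Example: tokens ["招标", "公告", ":"] yield offsets [0, 2, 4, 5].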

    def get_pos(self):
        list_pos = []
        for sentence in self.sentences:
            list_pos.extend(sentence.pos_tags)
        return list_pos

    def get_found_entities(self, entity_key_prefix, gazette_manager=None):
        """
        Generates FoundEntity objects for the entities found.
        For all the entities that came from a gazette, joins
        the ones with the same kind.
        """
        found_entities = []
        for i, j, kind, alias in self.get_entity_occurrences():
            if gazette_manager is not None:
                from_gazette = gazette_manager.was_entry_created_by_gazette(alias, kind)
            else:
                from_gazette = False
            if from_gazette:
                kind = gazette_manager.strip_kind(kind)
                key = alias
            else:
                key = "{} {} {} {}".format(entity_key_prefix, kind, i, j)
            found_entities.append(FoundEntity(
                key=key,
                kind_name=kind,
                alias=alias,
                offset=i,
                offset_end=j,
                from_gazette=from_gazette
            ))
        return found_entities
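
    # Example (hypothetical values): a non-gazette "org" occurrence covering
    # characters 12-18 of document "doc-001" gets the key "doc-001 org 12 18".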

    def get_entity_occurrences(self):
        """
        Returns a list of tuples (i, j, kind, alias) such that `i` is the start
        character offset of an entity occurrence in the document, `j` is the end
        offset, `kind` is the entity kind and `alias` is its surface text.
        """
        found_entities = []
        for entity in self.entitys:
            offset_begin = entity.wordOffset_begin
            offset_end = entity.wordOffset_end
            offset_sentence = self.dict_sentences[entity.sentence_index]["offset_word"][0]
            found_entities.append((offset_sentence + offset_begin,
                                   offset_sentence + offset_end,
                                   entity.entity_type,
                                   entity.entity_text))
        return found_entities

    def generate_spans_relations(self):
        """
        Writes every entity as a brat span and every entity pointer as a brat
        arc, and returns the serialized pre-labels for the document.
        """
        log("%s entity length:%d" % (self.docid, len(self.entitys)))
        list_pre_label = []
        # First pass: create a span for each entity and remember its annotation id.
        for _entity in self.entitys:
            sentence_offset = self.dict_sentences[_entity.sentence_index]["offset_word"][0]
            offset = [[sentence_offset + _entity.wordOffset_begin,
                       sentence_offset + _entity.wordOffset_end]]
            _type = getType(_entity)
            ann_id = annotator.create_span_interface(document=_entity.doc_id,
                                                     offsets=offset, _type=_type)
            _entity.ann_id = ann_id
            list_pre_label.append("T|%s|%d|%d" % (_type, offset[0][0], offset[0][1]))
        # Second pass: once every span has an id, create an arc for each pointer
        # the entity carries (pack, money, person, address, tendereeMoney).
        for _entity in self.entitys:
            for pointer_name, _type in dict_relations.items():
                p_target = getattr(_entity, pointer_name)
                if p_target is None:
                    continue
                annotator.create_arc_interface(document=_entity.doc_id,
                                               origin=_entity.ann_id,
                                               target=p_target.ann_id,
                                               type=_type)
                origin_offset = self.dict_sentences[_entity.sentence_index]["offset_word"][0]
                target_offset = self.dict_sentences[p_target.sentence_index]["offset_word"][0]
                list_pre_label.append("R|%s|%d|%d|%d|%d" % (
                    _type,
                    origin_offset + _entity.wordOffset_begin,
                    origin_offset + _entity.wordOffset_end,
                    target_offset + p_target.wordOffset_begin,
                    target_offset + p_target.wordOffset_end))
        return list_pre_label
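
# Pre-label wire format: spans are serialized as "T|<type>|<begin>|<end>" and
# relations as "R|<type>|<origin_begin>|<origin_end>|<target_begin>|<target_end>";
# the runner below joins them with ';' into document.pre_label.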

class SelfPreprocesser(BasePreProcessStepRunner):

    def __init__(self, increment_ner=False):
        super().__init__()
        self.increment_ner = increment_ner
        self.gazette_manager = None
        self.override = False
        self.step = PreProcessSteps.brat

    def __call__(self, document):
        self.run_everything(document)

    def run_everything(self, document):
        analysis = SelfAnalizer(document.human_identifier, document.sourcetext)
        # Only documents with a workable number of entities are pre-annotated;
        # anything else is flagged to be skipped.
        if 5 < len(analysis.entitys) < 300:
            # Tokenization
            document.text = analysis.article.content
            tokens = analysis.get_tokens()
            offsets = analysis.get_token_offsets()
            document.set_tokenization_result(offsets, tokens)
            # Lemmatization (the raw tokens stand in for lemmas)
            document.set_lemmatization_result(analysis.get_tokens())
            # "Sentencing" (splitting in sentences)
            document.set_sentencer_result(analysis.get_sentence_boundaries())
            # POS tagging, syntactic parsing and NER are not used in brat:
            # document.set_tagging_result(analysis.get_pos())
            # document.set_syntactic_parsing_result(analysis.get_parse_trees())
            # found_entities = analysis.get_found_entities(
            #     document.human_identifier, self.gazette_manager
            # )
            # document.set_ner_result(found_entities)
            # Save progress so far, next step doesn't modify `document`
            document.save()
            list_pre_label = analysis.generate_spans_relations()
            document.pre_label = ';'.join(list_pre_label)
            document.brat_done_at = datetime.datetime.now()
            document.save()
        else:
            document.jump_signal = 1
            document.save()
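
# A minimal driver sketch (hypothetical: in IEPY the preprocessing pipeline normally
# instantiates the runner and invokes it once per document object):
#
#     runner = SelfPreprocesser()
#     runner(document)  # __call__ delegates to run_everything()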

if __name__ == "__main__":
    print(1)