# self_preprocess.py

import datetime
from collections import defaultdict
from itertools import chain, groupby
import logging
import tempfile
import json

from iepy.preprocess.pipeline import BasePreProcessStepRunner
from iepy.preprocess.ner.base import FoundEntity
from iepy.data.models import EntityOccurrence, GazetteItem
from brat.models import BratAnnotation as brat_annotations
# The star imports below supply helpers such as log() used in this module.
from iepy.selfpreprocess.BiddingKG.dl.interface.Connection import *
from iepy.selfpreprocess.BiddingKG.dl.common.Utils import *
from iepy.selfpreprocess.BiddingKG.dl.interface.Connection import getConnection
import iepy.selfpreprocess.BiddingKG.dl.interface.predictor as predictor
import iepy.selfpreprocess.BiddingKG.dl.interface.Preprocessing as Preprocessing
import iepy.selfpreprocess.BiddingKG.dl.interface.getAttributes as getAttributes
import iepy.selfpreprocess.BiddingKG.dl.entityLink.entityLink as entityLink
# PreProcessSteps is deliberately taken from the selfpreprocess pipeline; it
# would otherwise be shadowed if also imported from iepy.preprocess.pipeline.
from iepy.selfpreprocess.pipeline import PreProcessSteps
from iepy.webui.brat.src import annotator

logger = logging.getLogger(__name__)

# The BiddingKG predictors load their models once at module import time and
# are then reused for every document processed by this process.
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()
def predict(doc_id, text):
    """Run the full BiddingKG extraction pipeline over a single document and
    return its articles, sentences and entities (one list entry per document).
    """
    log("process %s" % doc_id)
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", ""]], useselffool=True)
    # Each predictor annotates the sentence/entity lists in place.
    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    premPredict.predict(list_sentences, list_entitys)
    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    epcPredict.predict(list_sentences, list_entitys)
    timePredict.predict(list_sentences, list_entitys)
    entityLink.link_entitys(list_entitys)
    _prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    log("extract done %s" % (str(_prem)))
    return list_articles, list_sentences, list_entitys
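
# Illustrative call (not executed here): the doc id and text are made-up
# placeholders, and the attribute names on the returned objects follow their
# use later in this module.
#
#   articles, sentences, entitys = predict("doc-0001", "...tender notice text...")
#   articles[0].content   # cleaned article text
#   sentences[0]          # sentences of the first (only) document
#   entitys[0]            # entities, each carrying entity_type and label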
# Mapping from (entity_type, predicted label) to the brat span type.
dict_type = {"org": {"0": "org_tenderee",
                     "1": "org_agency",
                     "2": "org_tenderer",
                     "3": "org_secondTenderer",
                     "4": "org_thirdTenderer",
                     "5": "org"},
             "company": {"0": "company_tenderee",
                         "1": "company_agency",
                         "2": "company_tenderer",
                         "3": "company_secondTenderer",
                         "4": "company_thirdTenderer",
                         "5": "company"},
             "money": {"0": "money_tendereeMoney",
                       "1": "money_tendererMoney",
                       "2": "money"},
             "person": {"0": "person",
                        "1": "person_tendereePerson",
                        "2": "person_agencyPerson",
                        "3": "person_person",
                        "4": "person_review"},
             "time": {"0": "time",
                      "1": "time_release",
                      "2": "time_bidopen",
                      "3": "time_bidclose"}}
# Mappings from predicted labels to brat attribute values.
dict_role_attribute = {"0": "att_tenderee",
                       "1": "att_agency",
                       "2": "att_tenderer",
                       "3": "att_secondTenderer",
                       "4": "att_thirdTenderer",
                       "5": "att_noRole"}

dict_money_attribute = {"0": "att_tendereeMoney",
                        "1": "att_tendererMoney",
                        "2": "money"}

dict_person_attribute = {"0": "att_noperson",
                         "1": "att_tendereePerson",
                         "2": "att_agencyPerson",
                         "3": "att_person"}

# Mapping from entity pointer attributes to brat relation (arc) types.
dict_relations = {"pointer_pack": "rel_pack",
                  "pointer_money": "rel_tendererMoney",
                  "pointer_person": "rel_person",
                  "pointer_address": "rel_address",
                  "pointer_tendereeMoney": "rel_tendereeMoney"}
def getAttribute(_entity):
    """Return the brat attributes for an entity, keyed by attribute family."""
    attribute = {"role": None, "money": None, "person": None}
    if _entity.entity_type in ["org", "company"]:
        attribute["role"] = dict_role_attribute[str(_entity.label)]
    if _entity.entity_type in ["money"]:
        attribute["money"] = dict_money_attribute[str(_entity.label)]
    if _entity.entity_type in ["person"]:
        attribute["person"] = dict_person_attribute[str(_entity.label)]
    # Drop the attribute families that do not apply to this entity type.
    return {_key: _value for _key, _value in attribute.items() if _value is not None}
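
# Example: a "money" entity with label 1 yields {"money": "att_tendererMoney"};
# the "role" and "person" keys are dropped because they stayed None.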
def getType(_entity):
    """Map an entity to its brat span type, falling back to the raw type."""
    if _entity.entity_type in dict_type:
        if str(_entity.label) in dict_type[_entity.entity_type]:
            return dict_type[_entity.entity_type][str(_entity.label)]
    return _entity.entity_type
class SelfAnalizer():

    def __init__(self, doc_id, sourceText):
        self.docid = doc_id
        list_articles, list_sentences, list_entitys = predict(doc_id, sourceText)
        # The pipeline was fed a single document, so take the first entry.
        self.article = list_articles[0]
        self.sentences = list_sentences[0]
        self.entitys = list_entitys[0]
        self.dict_sentences = self.get_sentences()
        # Delete any previously stored brat annotations for this document.
        brat_annotations.objects.filter(document_id=doc_id).delete()
    def get_sentences(self):
        """Build a per-sentence index with cumulative character offsets
        ("offset_word") and token offsets ("offset_words") in the document."""
        dict_sentences = dict()
        offset_word = 0   # running character offset
        offset_words = 0  # running token offset
        for sentence in self.sentences:
            if sentence.sentence_index not in dict_sentences:
                dict_sentences[sentence.sentence_index] = {"object": sentence,
                                                           "offset_word": [-1, -1],
                                                           "offset_words": [-1, -1]}
            dict_sentences[sentence.sentence_index]["offset_word"] = [
                offset_word, offset_word + len(sentence.sentence_text)]
            dict_sentences[sentence.sentence_index]["offset_words"] = [
                offset_words, offset_words + len(sentence.tokens)]
            offset_word += len(sentence.sentence_text)
            offset_words += len(sentence.tokens)
        return dict_sentences
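
    # The resulting index looks like (illustrative sizes, made-up numbers):
    #   {0: {"object": <Sentence>, "offset_word": [0, 120], "offset_words": [0, 35]},
    #    1: {"object": <Sentence>, "offset_word": [120, 250], "offset_words": [35, 80]}}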
    def get_sentence_boundaries(self):
        """
        Returns a list with the character offsets where each sentence starts,
        in order. The list contains one extra element at the end with the
        total character length of the document.
        """
        ys = [0]
        for sentence in self.sentences:
            y = self.dict_sentences[sentence.sentence_index]["offset_word"][-1]
            ys.append(y)
        return ys
    def get_parse_trees(self):
        # Syntactic parsing is not performed by this pipeline.
        pass

    def get_tokens(self):
        list_tokens = []
        for sentence in self.sentences:
            list_tokens.extend(sentence.tokens)
        return list_tokens

    def get_lemmas(self):
        # No lemmatizer is run; the caller substitutes tokens for lemmas.
        return []
    def get_token_offsets(self):
        """Cumulative character offsets: one entry per token start, plus a
        final entry with the total character length."""
        list_offset = [0]
        for sentence in self.sentences:
            for token in sentence.tokens:
                _offset = list_offset[-1] + len(token)
                list_offset.append(_offset)
        return list_offset
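
    # Example: tokens ["招标", "公告", "1"] yield [0, 2, 4, 5].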
    def get_pos(self):
        list_pos = []
        for sentence in self.sentences:
            list_pos.extend(sentence.pos_tags)
        return list_pos
    def get_found_entities(self, entity_key_prefix, gazette_manager=None):
        """
        Generates FoundEntity objects for the entities found.
        For all the entities that came from a gazette, joins
        the ones with the same kind.
        """
        found_entities = []
        for i, j, kind, alias in self.get_entity_occurrences():
            if gazette_manager is not None:
                from_gazette = gazette_manager.was_entry_created_by_gazette(alias, kind)
            else:
                from_gazette = False
            if from_gazette:
                kind = gazette_manager.strip_kind(kind)
                key = alias
            else:
                key = "{} {} {} {}".format(entity_key_prefix, kind, i, j)
            found_entities.append(FoundEntity(
                key=key,
                kind_name=kind,
                alias=alias,
                offset=i,
                offset_end=j,
                from_gazette=from_gazette
            ))
        return found_entities
    def get_entity_occurrences(self):
        """
        Returns a list of tuples (i, j, kind, alias) such that `i` is the
        start offset of an entity occurrence in the document, `j` is its end
        offset, `kind` is the entity kind and `alias` is the entity text.
        """
        found_entities = []
        for entity in self.entitys:
            offset_begin = entity.wordOffset_begin
            offset_end = entity.wordOffset_end
            # Shift sentence-relative offsets by the sentence's own start
            # offset to get document-level character offsets.
            offset_sentence = self.dict_sentences[entity.sentence_index]["offset_word"][0]
            found_entities.append((offset_sentence + offset_begin,
                                   offset_sentence + offset_end,
                                   entity.entity_type,
                                   entity.entity_text))
        return found_entities
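
    # Example: an entity spanning characters 5..9 of sentence 2, where
    # sentence 2 starts at document offset 120, is reported as (125, 129, ...).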
    def generate_spans_relations(self):
        logger.info("%s entity length:%d" % (self.docid, len(self.entitys)))
        # First pass: create a brat span for every entity, remembering the
        # annotation id so arcs can refer to it.
        for _entity in self.entitys:
            doc_id = _entity.doc_id
            sentence_start = self.dict_sentences[_entity.sentence_index]["offset_word"][0]
            offset = [[sentence_start + _entity.wordOffset_begin,
                       sentence_start + _entity.wordOffset_end]]
            _type = getType(_entity)
            ann_id = annotator.create_span_interface(document=doc_id, offsets=offset, _type=_type)
            _entity.ann_id = ann_id
        # Second pass: create the arcs. This can only run once every span
        # exists, because each arc targets another entity's ann_id.
        for _entity in self.entitys:
            for pointer_name, rel_type in dict_relations.items():
                pointed = getattr(_entity, pointer_name, None)
                if pointed is not None:
                    annotator.create_arc_interface(document=_entity.doc_id,
                                                   origin=_entity.ann_id,
                                                   target=pointed.ann_id,
                                                   type=rel_type)
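
    # Illustrative outcome (made-up annotation ids): a tenderer span T3 whose
    # pointer_money points at a money span T7 produces a brat arc of type
    # rel_tendererMoney from T3 to T7.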
class SelfPreprocesser(BasePreProcessStepRunner):

    def __init__(self, increment_ner=False):
        super().__init__()
        self.increment_ner = increment_ner
        self.gazette_manager = None
        self.override = False
        self.step = PreProcessSteps.brat

    def __call__(self, document):
        self.run_everything(document)
    def run_everything(self, document):
        analysis = SelfAnalizer(document.human_identifier, document.sourcetext)
        # Only process documents with a plausible number of entities
        # (more than 5 and fewer than 500); otherwise mark them as skipped.
        if len(analysis.entitys) > 5 and len(analysis.entitys) < 500:
            document.text = analysis.article.content
            # Tokenization
            tokens = analysis.get_tokens()
            offsets = analysis.get_token_offsets()
            document.set_tokenization_result(offsets, tokens)
            # Lemmatization: no lemmatizer is run, the tokens are reused.
            document.set_lemmatization_result(analysis.get_tokens())
            # "Sentencing" (splitting in sentences)
            document.set_sentencer_result(analysis.get_sentence_boundaries())
            # POS tagging
            # document.set_tagging_result(analysis.get_pos())
            # Syntactic parsing
            # document.set_syntactic_parsing_result(analysis.get_parse_trees())
            # NER is not used in brat
            # found_entities = analysis.get_found_entities(
            #     document.human_identifier, self.gazette_manager
            # )
            # document.set_ner_result(found_entities)
            # Save progress so far, so the next step doesn't modify `document`
            document.save()
            analysis.generate_spans_relations()
            document.brat_done_at = datetime.datetime.now()
            document.save()
        else:
            document.jump_signal = 1
            document.save()
if __name__ == "__main__":
    print(1)
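
# Minimal usage sketch (assumes an IEPY document instance `doc` obtained
# elsewhere, e.g. from iepy.data.models; `doc` is not part of this module):
#
#   preprocessor = SelfPreprocesser()
#   preprocessor(doc)  # __call__ delegates to run_everything(doc)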