# self_preprocess.py
import datetime
import logging

from iepy.preprocess.pipeline import BasePreProcessStepRunner
from iepy.preprocess.ner.base import FoundEntity
from brat.models import BratAnnotation as brat_annotations

from iepy.selfpreprocess.BiddingKG.dl.interface.Connection import *
from iepy.selfpreprocess.BiddingKG.dl.common.Utils import *  # supplies helpers such as log()
import iepy.selfpreprocess.BiddingKG.dl.interface.predictor as predictor
import iepy.selfpreprocess.BiddingKG.dl.interface.Preprocessing as Preprocessing
import iepy.selfpreprocess.BiddingKG.dl.interface.getAttributes as getAttributes
import iepy.selfpreprocess.BiddingKG.dl.entityLink.entityLink as entityLink
import iepy.selfpreprocess.BiddingKG.dl.complaint.punish_predictor as punish_rule
from iepy.selfpreprocess.pipeline import PreProcessSteps
from iepy.webui.brat.src import annotator

logger = logging.getLogger(__name__)

# The predictors are instantiated once at module load so that repeated calls
# to predict() reuse the already-loaded models.
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()
punish = punish_rule.Punish_Extract()
productPredict = predictor.ProductPredictor()

def predict(doc_id, text):
    log("process %s" % doc_id)
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", ""]], useselffool=True)
    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    print(codeName)
    premPredict.predict(list_sentences, list_entitys)
    productPredict.predict(list_sentences, list_entitys)
    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    print("epcPredict")
    epcPredict.predict(list_sentences, list_entitys)
    print("timePredict")
    timePredict.predict(list_sentences, list_entitys)
    print("entityLink")
    entityLink.link_entitys(list_entitys)
    print("getPREMs")
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    log("extract done %s" % str(prem))
    return list_articles, list_sentences, list_entitys
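
# Usage sketch (doc_id and text are hypothetical placeholders; each returned
# list holds one element per input document):
#   articles, sentences, entitys = predict("doc-001", "...announcement text...")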

# Maps (entity_type, label) pairs to the brat span type used when
# pre-annotating a document.
dict_type = {"org": {"0": "org_tenderee",
                     "1": "org_agency",
                     "2": "org_tenderer",
                     "3": "org_secondTenderer",
                     "4": "org_thirdTenderer",
                     "5": "org"},
             "company": {"0": "company_tenderee",
                         "1": "company_agency",
                         "2": "company_tenderer",
                         "3": "company_secondTenderer",
                         "4": "company_thirdTenderer",
                         "5": "company"},
             "money": {"0": "money_tendereeMoney",
                       "1": "money_tendererMoney",
                       "2": "money"},
             "person": {"0": "person",
                        "1": "person_tendereePerson",
                        "2": "person_agencyPerson",
                        "3": "person_person",
                        "4": "person_review"},
             "time": {"0": "time",
                      "1": "time_release",
                      "2": "time_bidopen",
                      "3": "time_bidclose"}}

dict_role_attribute = {"0": "att_tenderee",
                       "1": "att_agency",
                       "2": "att_tenderer",
                       "3": "att_secondTenderer",
                       "4": "att_thirdTenderer",
                       "5": "att_noRole"}

dict_money_attribute = {"0": "att_tendereeMoney",
                        "1": "att_tendererMoney",
                        "2": "money"}

dict_person_attribute = {"0": "att_noperson",
                         "1": "att_tendereePerson",
                         "2": "att_agencyPerson",
                         "3": "att_person"}

# Maps an entity's pointer attribute name to the brat relation (arc) type.
dict_relations = {"pointer_pack": "rel_pack",
                  "pointer_money": "rel_tendererMoney",
                  "pointer_person": "rel_person",
                  "pointer_address": "rel_address",
                  "pointer_tendereeMoney": "rel_tendereeMoney"}

def getAttribute(_entity):
    attribute = {"role": None, "money": None, "person": None}
    if _entity.entity_type in ["org", "company"]:
        attribute["role"] = dict_role_attribute[str(_entity.label)]
    if _entity.entity_type in ["money"]:
        attribute["money"] = dict_money_attribute[str(_entity.label)]
    if _entity.entity_type in ["person"]:
        attribute["person"] = dict_person_attribute[str(_entity.label)]
    # Keep only the attribute slots that apply to this entity type.
    return {k: v for k, v in attribute.items() if v is not None}

def getType(_entity):
    if _entity.entity_type in dict_type:
        if str(_entity.label) in dict_type[_entity.entity_type]:
            return dict_type[_entity.entity_type][str(_entity.label)]
    return _entity.entity_type
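
# For example, an "org" entity labelled 0 maps to "org_tenderee"; a
# (type, label) pair missing from dict_type falls back to the raw entity_type.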


class SelfAnalizer:

    def __init__(self, doc_id, sourceText):
        self.docid = doc_id
        list_articles, list_sentences, list_entitys = predict(doc_id, sourceText)
        self.article = list_articles[0]
        self.sentences = list_sentences[0]
        self.entitys = list_entitys[0]
        self.dict_sentences = self.get_sentences()
        # Delete any annotations previously stored for this document.
        brat_annotations.objects.filter(document_id=doc_id).delete()

    def get_sentences(self):
        """
        Builds a dict mapping each sentence_index to the sentence object plus
        its character span ("offset_word") and token span ("offset_words")
        within the whole document.
        """
        dict_sentences = dict()
        offset_word = 0   # running character offset
        offset_words = 0  # running token offset
        self.sentences.sort(key=lambda x: x.sentence_index)
        for sentence in self.sentences:
            if sentence.sentence_index not in dict_sentences:
                dict_sentences[sentence.sentence_index] = {"object": sentence,
                                                           "offset_word": [-1, -1],
                                                           "offset_words": [-1, -1]}
            dict_sentences[sentence.sentence_index]["offset_word"] = [offset_word, offset_word + len(sentence.sentence_text)]
            dict_sentences[sentence.sentence_index]["offset_words"] = [offset_words, offset_words + len(sentence.tokens)]
            offset_word += len(sentence.sentence_text)
            offset_words += len(sentence.tokens)
        return dict_sentences
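
    # Example shape (a sketch): with two sentences of 10 and 8 characters and
    # of 6 and 5 tokens, dict_sentences[1]["offset_word"] == [10, 18] and
    # dict_sentences[1]["offset_words"] == [6, 11].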

    def get_sentence_boundaries(self):
        """
        Returns a list with the offsets at which each sentence ends, in
        order, starting with 0; the last element is the total length. Note
        that the offsets here are character-based ("offset_word"), matching
        the character-based token offsets used elsewhere in this module.
        """
        ys = [0]
        for sentence in self.sentences:
            y = self.dict_sentences[sentence.sentence_index]["offset_word"][-1]
            ys.append(y)
        return ys
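
    # e.g. two sentences of 10 and 8 characters yield [0, 10, 18] (a sketch).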

    def get_parse_trees(self):
        # Syntactic parsing is not produced by this pipeline.
        pass

    def get_tokens(self):
        list_tokens = []
        for sentence in self.sentences:
            list_tokens.extend(sentence.tokens)
        return list_tokens

    def get_lemmas(self):
        # No lemmatizer in this pipeline.
        return []

    def get_token_offsets(self):
        # Cumulative character offsets: entry k is where token k starts,
        # assuming tokens tile the text contiguously; the final entry is the
        # total character count.
        list_offset = [0]
        for sentence in self.sentences:
            for token in sentence.tokens:
                list_offset.append(list_offset[-1] + len(token))
        return list_offset
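
    # e.g. tokens ["招标", "公告", "。"] give offsets [0, 2, 4, 5].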

    def get_pos(self):
        list_pos = []
        for sentence in self.sentences:
            list_pos.extend(sentence.pos_tags)
        return list_pos

    def get_found_entities(self, entity_key_prefix, gazette_manager=None):
        """
        Generates FoundEntity objects for the entities found.
        For all the entities that came from a gazette, joins
        the ones with the same kind.
        """
        found_entities = []
        for i, j, kind, alias in self.get_entity_occurrences():
            if gazette_manager is not None:
                from_gazette = gazette_manager.was_entry_created_by_gazette(alias, kind)
            else:
                from_gazette = False
            if from_gazette:
                kind = gazette_manager.strip_kind(kind)
                key = alias
            else:
                key = "{} {} {} {}".format(entity_key_prefix, kind, i, j)
            found_entities.append(FoundEntity(
                key=key,
                kind_name=kind,
                alias=alias,
                offset=i,
                offset_end=j,
                from_gazette=from_gazette
            ))
        return found_entities
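
    # Non-gazette entities get keys like "doc-001 org 120 128" (prefix, kind,
    # start, end; values illustrative), so the same span always maps to the
    # same key.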

    def get_entity_occurrences(self):
        """
        Returns a list of tuples (i, j, kind, alias) where `i` is the start
        character offset of an entity occurrence in the whole document, `j`
        is its end offset, `kind` is the entity kind and `alias` is the
        entity text.
        """
        found_entities = []
        for entity in self.entitys:
            offset_begin = entity.wordOffset_begin
            offset_end = entity.wordOffset_end
            offset_sentence = self.dict_sentences[entity.sentence_index]["offset_word"][0]
            found_entities.append((offset_sentence + offset_begin,
                                   offset_sentence + offset_end,
                                   entity.entity_type,
                                   entity.entity_text))
        return found_entities
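
    # e.g. (120, 128, "org", "某某公司") for an entity covering characters
    # 120-128 of the document (values illustrative).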

    def generate_spans_relations(self):
        print("%s entity length:%d" % (self.docid, len(self.entitys)))
        list_pre_label = []

        def _char_span(entity):
            # Character span of an entity within the whole document.
            base = self.dict_sentences[entity.sentence_index]["offset_word"][0]
            return base + entity.wordOffset_begin, base + entity.wordOffset_end

        # Spans: create one brat span per entity and record a "T|..." label.
        for _entity in self.entitys:
            begin, end = _char_span(_entity)
            _type = getType(_entity)
            ann_id = annotator.create_span_interface(document=_entity.doc_id,
                                                     offsets=[[begin, end]],
                                                     _type=_type)
            _entity.ann_id = ann_id
            list_pre_label.append("T|%s|%d|%d" % (_type, begin, end))

        # Arcs: for each pointer attribute set on an entity, create a brat
        # arc and record an "R|..." label with both endpoints' spans.
        for _entity in self.entitys:
            for pointer_name, rel_type in dict_relations.items():
                p_target = getattr(_entity, pointer_name, None)
                if p_target is None:
                    continue
                annotator.create_arc_interface(document=_entity.doc_id,
                                               origin=_entity.ann_id,
                                               target=p_target.ann_id,
                                               type=rel_type)
                origin_begin, origin_end = _char_span(_entity)
                target_begin, target_end = _char_span(p_target)
                list_pre_label.append("R|%s|%d|%d|%d|%d" % (rel_type,
                                                            origin_begin, origin_end,
                                                            target_begin, target_end))
        return list_pre_label
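
    # Example output (values illustrative):
    #   ["T|org_tenderee|0|8", "T|money_tendereeMoney|30|37",
    #    "R|rel_tendereeMoney|0|8|30|37"]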


class SelfPreprocesser(BasePreProcessStepRunner):

    def __init__(self, increment_ner=False):
        super().__init__()
        self.increment_ner = increment_ner
        self.gazette_manager = None
        self.override = False
        self.step = PreProcessSteps.brat

    def __call__(self, document):
        self.run_everything(document)

    def run_everything(self, document):
        analysis = SelfAnalizer(document.human_identifier, document.sourcetext)
        # Only pre-annotate documents of manageable size: more than 5 but
        # fewer than 200 entities, and fewer than 60 sentences.
        if 5 < len(analysis.entitys) < 200 and len(analysis.sentences) < 60:
            document.text = analysis.article.content
            # Tokenization
            tokens = analysis.get_tokens()
            offsets = analysis.get_token_offsets()
            document.set_tokenization_result(offsets, tokens)
            # Lemmatization (no lemmatizer here; reuse the tokens)
            document.set_lemmatization_result(analysis.get_tokens())
            # "Sentencing" (splitting in sentences)
            document.set_sentencer_result(analysis.get_sentence_boundaries())
            # POS tagging and syntactic parsing are not used by brat:
            # document.set_tagging_result(analysis.get_pos())
            # document.set_syntactic_parsing_result(analysis.get_parse_trees())
            # NER is not used by brat either:
            # found_entities = analysis.get_found_entities(
            #     document.human_identifier, self.gazette_manager
            # )
            # document.set_ner_result(found_entities)
            # Save progress so far, so the next step doesn't modify `document`
            document.save()
            list_pre_label = analysis.generate_spans_relations()
            document.pre_label = ';'.join(list_pre_label)
            document.brat_done_at = datetime.datetime.now()
            document.save()
        else:
            # Mark documents outside the size limits to be skipped.
            document.jump_signal = 1
            document.save()


if __name__ == "__main__":
    print(1)
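    # Minimal usage sketch (assumes a configured Django environment; the
    # doc_id and text are hypothetical placeholders):
    # analysis = SelfAnalizer("doc-001", "...announcement text...")
    # print(analysis.generate_spans_relations())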