# self_preprocess.py

import datetime
import logging

from iepy.preprocess.pipeline import BasePreProcessStepRunner
from iepy.preprocess.ner.base import FoundEntity
from iepy.selfpreprocess.pipeline import PreProcessSteps
from iepy.selfpreprocess.BiddingKG.dl.common.Utils import *  # provides log()
import iepy.selfpreprocess.BiddingKG.dl.interface.predictor as predictor
import iepy.selfpreprocess.BiddingKG.dl.interface.Preprocessing as Preprocessing
import iepy.selfpreprocess.BiddingKG.dl.interface.getAttributes as getAttributes
import iepy.selfpreprocess.BiddingKG.dl.entityLink.entityLink as entityLink
import iepy.selfpreprocess.BiddingKG.dl.complaint.punish_predictor as punish_rule
from brat.models import BratAnnotation as brat_annotations
from iepy.webui.brat.src import annotator

logger = logging.getLogger(__name__)
# The extraction models are heavyweight; they are loaded once at import time
# and shared by every call to predict().
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()
punish = punish_rule.Punish_Extract()
productPredict = predictor.ProductPredictor()
def predict(doc_id, text):
    """Run the full BiddingKG extraction pipeline over one document and
    return its articles, sentences and entities."""
    log("process %s" % doc_id)
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", ""]], useselffool=True)
    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    print(codeName)
    premPredict.predict(list_sentences, list_entitys)
    productPredict.predict(list_sentences, list_entitys)
    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    print("epcPredict")
    epcPredict.predict(list_sentences, list_entitys)
    print("timePredict")
    timePredict.predict(list_sentences, list_entitys)
    print("entityLink")
    entityLink.link_entitys(list_entitys)
    print("getPREMs")
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    log("extract done %s" % (str(prem)))
    return list_articles, list_sentences, list_entitys
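
# A minimal usage sketch (`demo_text` is a hypothetical placeholder; assumes
# the heavyweight models above loaded successfully at import time):
#
#     articles, sentences, entitys = predict("doc-001", demo_text)
#     for _entity in entitys[0]:
#         print(_entity.entity_type, _entity.entity_text)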

# Mapping from (entity_type, predicted label) to brat span types.
dict_type = {"org": {"0": "org_tenderee",
                     "1": "org_agency",
                     "2": "org_tenderer",
                     "3": "org_secondTenderer",
                     "4": "org_thirdTenderer",
                     "5": "org"},
             "company": {"0": "company_tenderee",
                         "1": "company_agency",
                         "2": "company_tenderer",
                         "3": "company_secondTenderer",
                         "4": "company_thirdTenderer",
                         "5": "company"},
             "money": {"0": "money_tendereeMoney",
                       "1": "money_tendererMoney",
                       "2": "money"},
             "person": {"0": "person",
                        "1": "person_tendereePerson",
                        "2": "person_agencyPerson",
                        "3": "person_person",
                        "4": "person_review"},
             "time": {"0": "time",
                      "1": "time_release",
                      "2": "time_bidopen",
                      "3": "time_bidclose"}}

# Mappings from predicted labels to brat attribute names, per entity type.
dict_role_attribute = {"0": "att_tenderee",
                       "1": "att_agency",
                       "2": "att_tenderer",
                       "3": "att_secondTenderer",
                       "4": "att_thirdTenderer",
                       "5": "att_noRole"}

dict_money_attribute = {"0": "att_tendereeMoney",
                        "1": "att_tendererMoney",
                        "2": "money"}

dict_person_attribute = {"0": "att_noperson",
                         "1": "att_tendereePerson",
                         "2": "att_agencyPerson",
                         "3": "att_person"}

# Mapping from entity pointer fields to brat relation types.
dict_relations = {"pointer_pack": "rel_pack",
                  "pointer_money": "rel_tendererMoney",
                  "pointer_person": "rel_person",
                  "pointer_address": "rel_address",
                  "pointer_tendereeMoney": "rel_tendereeMoney",
                  "person_phone": "rel_phone"}
def getAttribute(_entity):
    """Map an entity's predicted label to its brat attribute name(s)."""
    attribute = {"role": None, "money": None, "person": None}
    if _entity.entity_type in ["org", "company"]:
        attribute["role"] = dict_role_attribute[str(_entity.label)]
    if _entity.entity_type in ["money"]:
        attribute["money"] = dict_money_attribute[str(_entity.label)]
    if _entity.entity_type in ["person"]:
        attribute["person"] = dict_person_attribute[str(_entity.label)]
    # Drop the attribute slots that do not apply to this entity type.
    return {_key: _value for _key, _value in attribute.items() if _value is not None}
def getType(_entity):
    """Map an entity to its brat span type, falling back to the raw entity type."""
    if _entity.entity_type in dict_type:
        if str(_entity.label) in dict_type[_entity.entity_type]:
            return dict_type[_entity.entity_type][str(_entity.label)]
    return _entity.entity_type
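
# Illustration only: getType/getAttribute read nothing beyond `entity_type`
# and `label`, so a SimpleNamespace stand-in (hypothetical, not a real
# pipeline entity) is enough to show the label-to-brat mapping.
def _demo_type_mapping():
    from types import SimpleNamespace
    assert getType(SimpleNamespace(entity_type="org", label=0)) == "org_tenderee"
    assert getType(SimpleNamespace(entity_type="time", label=2)) == "time_bidopen"
    assert getAttribute(SimpleNamespace(entity_type="money", label=1)) == {"money": "att_tendererMoney"}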

class SelfAnalizer:

    def __init__(self, doc_id, sourceText):
        self.docid = doc_id
        list_articles, list_sentences, list_entitys = predict(doc_id, sourceText)
        self.article = list_articles[0]
        self.sentences = list_sentences[0]
        self.entitys = list_entitys[0]
        self.dict_sentences = self.get_sentences()
        # Delete any previously stored annotations for this document.
        brat_annotations.objects.filter(document_id=doc_id).delete()
    def get_sentences(self):
        """Index sentences by sentence_index, recording both the character
        ("offset_word") and the token ("offset_words") offset range of each
        sentence within the document."""
        dict_sentences = dict()
        offset_word = 0   # running character offset
        offset_words = 0  # running token offset
        self.sentences.sort(key=lambda x: x.sentence_index)
        for sentence in self.sentences:
            if sentence.sentence_index not in dict_sentences:
                dict_sentences[sentence.sentence_index] = {"object": sentence,
                                                           "offset_word": [-1, -1],
                                                           "offset_words": [-1, -1]}
            dict_sentences[sentence.sentence_index]["offset_word"] = [offset_word, offset_word + len(sentence.sentence_text)]
            dict_sentences[sentence.sentence_index]["offset_words"] = [offset_words, offset_words + len(sentence.tokens)]
            offset_word += len(sentence.sentence_text)
            offset_words += len(sentence.tokens)
        return dict_sentences
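        # Resulting shape (values illustrative):
        #   {0: {"object": <Sentence>, "offset_word": [0, 12],  "offset_words": [0, 8]},
        #    1: {"object": <Sentence>, "offset_word": [12, 30], "offset_words": [8, 19]},
        #    ...}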
    def get_sentence_boundaries(self):
        """
        Returns a list with the character offsets at which each sentence
        starts, in order. The list contains one extra element at the end
        holding the total number of characters.
        """
        ys = [0]
        for sentence in self.sentences:
            y = self.dict_sentences[sentence.sentence_index]["offset_word"][-1]
            ys.append(y)
        return ys
    def get_parse_trees(self):
        # Syntactic parsing is not performed.
        pass

    def get_tokens(self):
        list_tokens = []
        for sentence in self.sentences:
            list_tokens.extend(sentence.tokens)
        return list_tokens

    def get_lemmas(self):
        # Lemmatization is not performed.
        return []

    def get_token_offsets(self):
        # Cumulative character offsets: entry i is where token i starts; the
        # final entry is the total character length.
        list_offset = [0]
        for sentence in self.sentences:
            for token in sentence.tokens:
                _offset = list_offset[-1] + len(token)
                list_offset.append(_offset)
        return list_offset

    def get_pos(self):
        list_pos = []
        for sentence in self.sentences:
            list_pos.extend(sentence.pos_tags)
        return list_pos
    def get_found_entities(self, entity_key_prefix, gazette_manager=None):
        """
        Generates FoundEntity objects for the entities found.
        For all the entities that came from a gazette, joins
        the ones with the same kind.
        """
        found_entities = []
        for i, j, kind, alias in self.get_entity_occurrences():
            if gazette_manager is not None:
                from_gazette = gazette_manager.was_entry_created_by_gazette(alias, kind)
            else:
                from_gazette = False
            if from_gazette:
                kind = gazette_manager.strip_kind(kind)
                key = alias
            else:
                key = "{} {} {} {}".format(entity_key_prefix, kind, i, j)
            found_entities.append(FoundEntity(
                key=key,
                kind_name=kind,
                alias=alias,
                offset=i,
                offset_end=j,
                from_gazette=from_gazette
            ))
        return found_entities
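    # For non-gazette entities the key is positional, e.g. an "org"
    # occurrence spanning offsets 13-16 of document "12.html" gets the key
    # "12.html org 13 16" (identifier illustrative).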
    def get_entity_occurrences(self):
        """
        Returns a list of tuples (i, j, kind, alias) such that `i` is the
        start offset of an entity occurrence, `j` is the end offset, `kind`
        is the entity kind and `alias` is its surface text.
        """
        found_entities = []
        for entity in self.entitys:
            # Sentence-relative offsets are shifted by the sentence's own
            # character offset to get document-absolute positions.
            offset_sentence = self.dict_sentences[entity.sentence_index]["offset_word"][0]
            found_entities.append((offset_sentence + entity.wordOffset_begin,
                                   offset_sentence + entity.wordOffset_end,
                                   entity.entity_type,
                                   entity.entity_text))
        return found_entities
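    # Example of the shift: an entity at sentence-relative [2, 5) inside a
    # sentence that starts at absolute character 40 yields (42, 45, kind,
    # text). Numbers are illustrative.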
    def generate_spans_relations(self):
        """Create brat spans for every entity, then arcs for every pointer
        between entities; returns the "pre label" strings for both."""
        print("%s entity length:%d" % (self.docid, len(self.entitys)))
        list_pre_label = []
        for _entity in self.entitys:
            offset_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0] + _entity.wordOffset_begin
            offset_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0] + _entity.wordOffset_end
            _type = getType(_entity)
            ann_id = annotator.create_span_interface(document=_entity.doc_id,
                                                     offsets=[[offset_begin, offset_end]],
                                                     _type=_type)
            _entity.ann_id = ann_id
            list_pre_label.append("T|%s|%d|%d" % (_type, offset_begin, offset_end))
        for _entity in self.entitys:
            # Collect every pointer of this entity: the single-valued ones may
            # be None, and pointer_person/person_phone may hold several targets.
            _pointers = [("pointer_pack", _entity.pointer_pack),
                         ("pointer_money", _entity.pointer_money)]
            _pointers.extend(("pointer_person", p) for p in (_entity.pointer_person or []))
            _pointers.extend(("person_phone", p) for p in (_entity.person_phone or []))
            _pointers.append(("pointer_address", _entity.pointer_address))
            _pointers.append(("pointer_tendereeMoney", _entity.pointer_tendereeMoney))
            for _pointer_name, p_target in _pointers:
                if p_target is None:
                    continue
                _type = dict_relations[_pointer_name]
                annotator.create_arc_interface(document=_entity.doc_id,
                                               origin=_entity.ann_id,
                                               target=p_target.ann_id,
                                               type=_type)
                origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0] + _entity.wordOffset_begin
                origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0] + _entity.wordOffset_end
                target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0] + p_target.wordOffset_begin
                target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0] + p_target.wordOffset_end
                list_pre_label.append("R|%s|%d|%d|%d|%d" % (_type, origin_begin, origin_end, target_begin, target_end))
        return list_pre_label
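
# The pre-label strings built above form a small ad-hoc serialization:
# "T|<type>|<begin>|<end>" for spans and
# "R|<type>|<origin_begin>|<origin_end>|<target_begin>|<target_end>" for
# relations, joined with ";" in run_everything below. A minimal reader
# sketch for that format (the helper name is ours, not part of the
# original pipeline):
def parse_pre_label(pre_label):
    spans, relations = [], []
    for _label in filter(None, pre_label.split(";")):
        _parts = _label.split("|")
        if _parts[0] == "T":
            spans.append((_parts[1], int(_parts[2]), int(_parts[3])))
        elif _parts[0] == "R":
            relations.append((_parts[1],) + tuple(int(_x) for _x in _parts[2:6]))
    return spans, relations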

class SelfPreprocesser(BasePreProcessStepRunner):

    def __init__(self, increment_ner=False):
        super().__init__()
        self.increment_ner = increment_ner
        self.gazette_manager = None
        self.override = False
        self.step = PreProcessSteps.brat

    def __call__(self, document):
        self.run_everything(document)

    def run_everything(self, document):
        analysis = SelfAnalizer(document.human_identifier, document.sourcetext)
        # Only annotate documents of a manageable size: more than 5 but fewer
        # than 200 entities, and fewer than 60 sentences.
        if 5 < len(analysis.entitys) < 200 and len(analysis.sentences) < 60:
            document.text = analysis.article.content
            # Tokenization
            tokens = analysis.get_tokens()
            offsets = analysis.get_token_offsets()
            document.set_tokenization_result(offsets, tokens)
            # Lemmatization (the tokens double as lemmas; no lemmatizer is run)
            document.set_lemmatization_result(analysis.get_tokens())
            # "Sentencing" (splitting into sentences)
            document.set_sentencer_result(analysis.get_sentence_boundaries())
            # POS tagging and syntactic parsing are not used by brat:
            # document.set_tagging_result(analysis.get_pos())
            # document.set_syntactic_parsing_result(analysis.get_parse_trees())
            # NER is not used by brat either:
            # found_entities = analysis.get_found_entities(
            #     document.human_identifier, self.gazette_manager
            # )
            # document.set_ner_result(found_entities)
            # Save progress so far, so the next step doesn't modify `document`
            document.save()
            list_pre_label = analysis.generate_spans_relations()
            document.pre_label = ';'.join(list_pre_label)
            document.brat_done_at = datetime.datetime.now()
            document.save()
        else:
            # Document too small or too large to annotate; flag it as skipped.
            document.jump_signal = 1
            document.save()

if __name__ == "__main__":
    print(1)
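    # A fuller smoke test needs a configured Django environment plus a real
    # document, roughly (identifiers hypothetical, sketch only):
    #     analysis = SelfAnalizer("doc-001", sample_source_text)
    #     print(analysis.generate_spans_relations())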