import datetime
import logging

from iepy.preprocess.pipeline import BasePreProcessStepRunner
from iepy.preprocess.ner.base import FoundEntity
from brat.models import BratAnnotation as brat_annotations
# The wildcard imports provide, among others, getConnection and log().
from iepy.selfpreprocess.BiddingKG.dl.interface.Connection import *
from iepy.selfpreprocess.BiddingKG.dl.common.Utils import *
import iepy.selfpreprocess.BiddingKG.dl.interface.predictor as predictor
import iepy.selfpreprocess.BiddingKG.dl.interface.Preprocessing as Preprocessing
import iepy.selfpreprocess.BiddingKG.dl.interface.getAttributes as getAttributes
import iepy.selfpreprocess.BiddingKG.dl.entityLink.entityLink as entityLink
# PreProcessSteps must come from iepy.selfpreprocess: it defines the `brat`
# step used below. Importing it from iepy.preprocess as well would shadow it.
from iepy.selfpreprocess.pipeline import PreProcessSteps
import iepy.selfpreprocess.BiddingKG.dl.complaint.punish_predictor as punish_rule
from iepy.webui.brat.src import annotator

logger = logging.getLogger(__name__)

# Model predictors are loaded once at import time and shared across documents.
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()
punish = punish_rule.Punish_Extract()
productPredict = predictor.ProductPredictor()


def predict(doc_id, text):
    """Run the BiddingKG extraction pipeline over one document and return its
    articles, sentences and entities (one element per document)."""
    log("process %s" % doc_id)
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", ""]], useselffool=True)
    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    logger.debug("codeName: %s", codeName)
    premPredict.predict(list_sentences, list_entitys)
    productPredict.predict(list_sentences, list_entitys)
    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    epcPredict.predict(list_sentences, list_entitys)
    timePredict.predict(list_sentences, list_entitys)
    entityLink.link_entitys(list_entitys)
    # getPREMs links projects, roles and money; the returned value itself is
    # not needed here, only the side effects on the entity objects.
    getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    return list_articles, list_sentences, list_entitys
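# A minimal usage sketch for predict() (hypothetical doc id and text; in
# normal operation it is only called through SelfAnalizer below):
#
#   list_articles, list_sentences, list_entitys = predict("doc-1", "...announcement text...")
#   for entity in list_entitys[0]:
#       print(entity.entity_type, entity.entity_text, entity.label)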
{"0":"att_noperson", "1":"att_tendereePerson", "2":"att_agencyPerson", "3":"att_person"} dict_relations = {"pointer_pack":"rel_pack", "pointer_money":"rel_tendererMoney", "pointer_person":"rel_person", "pointer_address":"rel_address", "pointer_tendereeMoney":"rel_tendereeMoney"} def getAttribute(_entity): attribute = {"role":None,"money":None,"person":None} if _entity.entity_type in ["org","company"]: attribute["role"] = dict_role_attribute[str(_entity.label)] if _entity.entity_type in ["money"]: attribute["money"] = dict_money_attribute[str(_entity.label)] if _entity.entity_type in ["person"]: attribute["person"] = dict_person_attribute[str(_entity.label)] list_popkeys = [] for _key in attribute.keys(): if attribute[_key] is None: list_popkeys.append(_key) for _key in list_popkeys: attribute.pop(_key) return attribute def getType(_entity): if _entity.entity_type in dict_type: if str(_entity.label) in dict_type[_entity.entity_type]: return dict_type[_entity.entity_type][str(_entity.label)] return _entity.entity_type class SelfAnalizer(): def __init__(self,doc_id,sourceText): self.docid=doc_id list_articles,list_sentences,list_entitys = predict(doc_id,sourceText) self.article = list_articles[0] # print(self.article.content) self.sentences = list_sentences[0] self.entitys = list_entitys[0] self.dict_sentences = self.get_sentences() #删除原先的数据 brat_annotations.objects.filter(document_id=doc_id).delete() def get_sentences(self): dict_sentences = dict() offset_word = 0 offset_words = 0 self.sentences.sort(key=lambda x:x.sentence_index) for sentence in self.sentences: # print(len(sentence.sentence_text),sentence.sentence_text) if sentence.sentence_index not in dict_sentences: dict_sentences[sentence.sentence_index] = {"object":sentence,"offset_word":[-1,-1],"offset_words":[-1,-1]} dict_sentences[sentence.sentence_index]["offset_word"] = [offset_word,offset_word+len(sentence.sentence_text)] dict_sentences[sentence.sentence_index]["offset_words"] = [offset_words,offset_words+len(sentence.tokens)] offset_word += len(sentence.sentence_text) offset_words += len(sentence.tokens) return dict_sentences def get_sentence_boundaries(self): """ Returns a list with the offsets in tokens where each sentence starts, in order. The list contains one extra element at the end containing the total number of tokens. """ ys = [0] for sentence in self.sentences: y = self.dict_sentences[sentence.sentence_index]["offset_word"][-1] ys.append(y) return ys def get_parse_trees(self): pass def get_tokens(self): list_tokens = [] for sentence in self.sentences: list_tokens.extend(sentence.tokens) return list_tokens def get_lemmas(self): return [] def get_token_offsets(self): list_offset = [0] for sentence in self.sentences: for token in sentence.tokens: _offset = list_offset[-1]+len(token) list_offset.append(_offset) return list_offset def get_pos(self): list_pos = [] for sentence in self.sentences: list_pos.extend(sentence.pos_tags) return list_pos def get_found_entities(self, entity_key_prefix, gazette_manager=None): """ Generates FoundEntity objects for the entities found. For all the entities that came from a gazette, joins the ones with the same kind. 
""" found_entities = [] tokens = self.get_tokens() for i, j, kind,alias in self.get_entity_occurrences(): # alias = "".join(tokens[i:j]) if gazette_manager is not None: from_gazette = gazette_manager.was_entry_created_by_gazette(alias, kind) else: from_gazette = False if from_gazette: kind = gazette_manager.strip_kind(kind) key = alias else: key = "{} {} {} {}".format(entity_key_prefix, kind, i, j) found_entities.append(FoundEntity( key=key, kind_name=kind, alias=alias, offset=i, offset_end=j, from_gazette=from_gazette )) return found_entities def get_entity_occurrences(self): """ Returns a list of tuples (i, j, kind) such that `i` is the start offset of an entity occurrence, `j` is the end offset and `kind` is the entity kind of the entity. """ found_entities = [] for entity in self.entitys: offset_begin = entity.wordOffset_begin offset_end = entity.wordOffset_end offset_sentence = self.dict_sentences[entity.sentence_index]["offset_word"][0] found_entities.append((offset_sentence+offset_begin,offset_sentence+offset_end,entity.entity_type,entity.entity_text)) return found_entities def generate_spans_relations(self): print("%s entity length:%d"%(self.docid,len(self.entitys))) list_pre_label = [] for _entity in self.entitys: doc_id = _entity.doc_id offset = [[self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin,self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end]] _type = getType(_entity) ann_id = annotator.create_span_interface(document=doc_id,offsets=offset,_type=_type) _entity.ann_id = ann_id _label = "T|%s|%d|%d"%(_type,offset[0][0],offset[0][1]) list_pre_label.append(_label) for _entity in self.entitys: if _entity.pointer_pack is not None: origin = _entity.ann_id target = _entity.pointer_pack.ann_id _type = dict_relations["pointer_pack"] annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type) origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end p_target = _entity.pointer_pack target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end) list_pre_label.append(_label) if _entity.pointer_money is not None: origin = _entity.ann_id target = _entity.pointer_money.ann_id # print("$$$$$$$$",_entity.pointer_money.entity_text) _type = dict_relations["pointer_money"] annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type) origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end p_target = _entity.pointer_money target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end) list_pre_label.append(_label) if _entity.pointer_person is not None: origin = _entity.ann_id target = _entity.pointer_person.ann_id _type = dict_relations["pointer_person"] 
annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type) origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end p_target = _entity.pointer_person target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end) list_pre_label.append(_label) if _entity.pointer_address is not None: origin = _entity.ann_id target = _entity.pointer_address.ann_id _type = dict_relations["pointer_address"] annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type) origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end p_target = _entity.pointer_address target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end) list_pre_label.append(_label) if _entity.pointer_tendereeMoney is not None: origin = _entity.ann_id target = _entity.pointer_tendereeMoney.ann_id _type = dict_relations["pointer_tendereeMoney"] annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type) origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end p_target = _entity.pointer_tendereeMoney target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end) list_pre_label.append(_label) return list_pre_label class SelfPreprocesser(BasePreProcessStepRunner): def __init__(self, increment_ner=False): super().__init__() self.increment_ner = increment_ner self.gazette_manager = None self.override = False self.step = PreProcessSteps.brat def __call__(self, document): self.run_everything(document) def run_everything(self,document): analysis = SelfAnalizer(document.human_identifier,document.sourcetext) # Tokenization if len(analysis.entitys)>5 and len(analysis.entitys)<300: document.text = analysis.article.content tokens = analysis.get_tokens() offsets = analysis.get_token_offsets() document.set_tokenization_result(offsets, tokens) # Lemmatization document.set_lemmatization_result(analysis.get_tokens()) # "Sentencing" (splitting in sentences) document.set_sentencer_result(analysis.get_sentence_boundaries()) # POS tagging # document.set_tagging_result(analysis.get_pos()) # Syntactic parsing # document.set_syntactic_parsing_result(analysis.get_parse_trees()) # not used in brat # # NER # found_entities = analysis.get_found_entities( # document.human_identifier, self.gazette_manager # ) # document.set_ner_result(found_entities) # Save progress so far, next step doesn't modify `document` document.save() list_pre_label = analysis.generate_spans_relations() 
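# generate_spans_relations() encodes spans and relations as compact
# pipe-separated strings with document-level character offsets, e.g.
# (illustrative values only):
#   "T|org_tenderee|10|18"        span: type, start, end
#   "R|rel_person|10|18|25|28"    arc: type, origin start/end, target start/end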
class SelfPreprocesser(BasePreProcessStepRunner):

    def __init__(self, increment_ner=False):
        super().__init__()
        self.increment_ner = increment_ner
        self.gazette_manager = None
        self.override = False
        self.step = PreProcessSteps.brat

    def __call__(self, document):
        self.run_everything(document)

    def run_everything(self, document):
        analysis = SelfAnalizer(document.human_identifier, document.sourcetext)
        # Only pre-label documents with a reasonable number of entities;
        # everything else is flagged to be skipped.
        if 5 < len(analysis.entitys) < 300:
            document.text = analysis.article.content
            # Tokenization
            tokens = analysis.get_tokens()
            offsets = analysis.get_token_offsets()
            document.set_tokenization_result(offsets, tokens)
            # Lemmatization
            document.set_lemmatization_result(analysis.get_tokens())
            # "Sentencing" (splitting in sentences)
            document.set_sentencer_result(analysis.get_sentence_boundaries())
            # POS tagging, syntactic parsing and NER are not used for brat:
            # document.set_tagging_result(analysis.get_pos())
            # document.set_syntactic_parsing_result(analysis.get_parse_trees())
            # found_entities = analysis.get_found_entities(
            #     document.human_identifier, self.gazette_manager)
            # document.set_ner_result(found_entities)
            # Save progress so far, next step doesn't modify `document`
            document.save()
            list_pre_label = analysis.generate_spans_relations()
            document.pre_label = ';'.join(list_pre_label)
            document.brat_done_at = datetime.datetime.now()
            document.save()
        else:
            document.jump_signal = 1
            document.save()


if __name__ == "__main__":
    print(1)
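# A minimal driver sketch (hedged: assumes an IEPY document object exposing
# `human_identifier`, `sourcetext` and the setter methods used above):
#
#   runner = SelfPreprocesser()
#   runner(document)  # runs the brat pre-labeling step on one document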