import datetime
import json
import logging
import tempfile
from collections import defaultdict
from itertools import chain, groupby

from iepy.preprocess.pipeline import BasePreProcessStepRunner
from iepy.preprocess.ner.base import FoundEntity
from iepy.data.models import EntityOccurrence, GazetteItem
from iepy.selfpreprocess.pipeline import PreProcessSteps
from brat.models import BratAnnotation as brat_annotations
from iepy.selfpreprocess.BiddingKG.dl.interface.Connection import *
from iepy.selfpreprocess.BiddingKG.dl.common.Utils import *  # wildcard import supplies log() used below
import iepy.selfpreprocess.BiddingKG.dl.interface.predictor as predictor
import iepy.selfpreprocess.BiddingKG.dl.interface.Preprocessing as Preprocessing
import iepy.selfpreprocess.BiddingKG.dl.interface.getAttributes as getAttributes
import iepy.selfpreprocess.BiddingKG.dl.entityLink.entityLink as entityLink
import iepy.selfpreprocess.BiddingKG.dl.complaint.punish_predictor as punish_rule
from iepy.webui.brat.src import annotator

logger = logging.getLogger(__name__)

codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()
punish = punish_rule.Punish_Extract()
productPredict = predictor.ProductPredictor()
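# Note: these predictor singletons are created at import time, so the underlying
# models are loaded once per process and shared by every call to predict() below.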


def predict(doc_id, text):
    """Run the full BiddingKG extraction pipeline over a single document and
    return its articles, sentences and entities (one list per input document)."""
    log("process %s" % doc_id)
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", ""]], useselffool=True)
    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    logger.debug("codeName: %s" % str(codeName))
    premPredict.predict(list_sentences, list_entitys)
    productPredict.predict(list_sentences, list_entitys)
    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    logger.debug("epcPredict")
    epcPredict.predict(list_sentences, list_entitys)
    logger.debug("timePredict")
    timePredict.predict(list_sentences, list_entitys)
    logger.debug("entityLink")
    entityLink.link_entitys(list_entitys)
    logger.debug("getPREMs")
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    log("extract done %s" % str(prem))
    return list_articles, list_sentences, list_entitys
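

# Hedged usage note ("doc-001" is a hypothetical identifier, not real data):
#   list_articles, list_sentences, list_entitys = predict("doc-001", some_text)
# Each returned list is parallel to the input batch; this module always passes a
# single document, so callers take element [0] (see SelfAnalizer.__init__).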


dict_type = {"org": {"0": "org_tenderee",
                     "1": "org_agency",
                     "2": "org_tenderer",
                     "3": "org_secondTenderer",
                     "4": "org_thirdTenderer",
                     "5": "org"},
             "company": {"0": "company_tenderee",
                         "1": "company_agency",
                         "2": "company_tenderer",
                         "3": "company_secondTenderer",
                         "4": "company_thirdTenderer",
                         "5": "company"},
             "money": {"0": "money_tendereeMoney",
                       "1": "money_tendererMoney",
                       "2": "money"},
             "person": {"0": "person",
                        "1": "person_tendereePerson",
                        "2": "person_agencyPerson",
                        "3": "person_person",
                        "4": "person_review"},
             "time": {"0": "time",
                      "1": "time_release",
                      "2": "time_bidopen",
                      "3": "time_bidclose"}}

dict_role_attribute = {"0": "att_tenderee",
                       "1": "att_agency",
                       "2": "att_tenderer",
                       "3": "att_secondTenderer",
                       "4": "att_thirdTenderer",
                       "5": "att_noRole"}

dict_money_attribute = {"0": "att_tendereeMoney",
                        "1": "att_tendererMoney",
                        "2": "money"}

dict_person_attribute = {"0": "att_noperson",
                         "1": "att_tendereePerson",
                         "2": "att_agencyPerson",
                         "3": "att_person"}

dict_relations = {"pointer_pack": "rel_pack",
                  "pointer_money": "rel_tendererMoney",
                  "pointer_person": "rel_person",
                  "pointer_address": "rel_address",
                  "pointer_tendereeMoney": "rel_tendereeMoney",
                  "person_phone": "rel_phone",
                  }
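# dict_relations maps the pointer attributes set on an entity by the extraction
# step (e.g. entity.pointer_money) to the brat arc type drawn between the two
# spans (e.g. "rel_tendererMoney"); see generate_spans_relations below.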


def getAttribute(_entity):
    attribute = {"role": None, "money": None, "person": None}
    if _entity.entity_type in ["org", "company"]:
        attribute["role"] = dict_role_attribute[str(_entity.label)]
    if _entity.entity_type in ["money"]:
        attribute["money"] = dict_money_attribute[str(_entity.label)]
    if _entity.entity_type in ["person"]:
        attribute["person"] = dict_person_attribute[str(_entity.label)]
    # Keep only the attributes that apply to this entity type.
    return {_key: _value for _key, _value in attribute.items() if _value is not None}


def getType(_entity):
    if _entity.entity_type in dict_type:
        if str(_entity.label) in dict_type[_entity.entity_type]:
            return dict_type[_entity.entity_type][str(_entity.label)]
    return _entity.entity_type


class SelfAnalizer():

    def __init__(self, doc_id, sourceText):
        self.docid = doc_id
        list_articles, list_sentences, list_entitys = predict(doc_id, sourceText)
        self.article = list_articles[0]
        self.sentences = list_sentences[0]
        self.entitys = list_entitys[0]
        self.dict_sentences = self.get_sentences()
        # Delete any annotations previously stored for this document.
        brat_annotations.objects.filter(document_id=doc_id).delete()

    def get_sentences(self):
        """Index sentences by sentence_index, recording for each one its
        character span (offset_word) and token span (offset_words) within
        the whole document."""
        dict_sentences = dict()
        offset_word = 0   # running character offset
        offset_words = 0  # running token offset
        self.sentences.sort(key=lambda x: x.sentence_index)
        for sentence in self.sentences:
            if sentence.sentence_index not in dict_sentences:
                dict_sentences[sentence.sentence_index] = {"object": sentence,
                                                           "offset_word": [-1, -1],
                                                           "offset_words": [-1, -1]}
            dict_sentences[sentence.sentence_index]["offset_word"] = [offset_word, offset_word + len(sentence.sentence_text)]
            dict_sentences[sentence.sentence_index]["offset_words"] = [offset_words, offset_words + len(sentence.tokens)]
            offset_word += len(sentence.sentence_text)
            offset_words += len(sentence.tokens)
        return dict_sentences
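    # Illustrative shape (values hypothetical): for a first sentence of 30
    # characters / 12 tokens, dict_sentences[0] would look like
    #   {"object": <Sentence>, "offset_word": [0, 30], "offset_words": [0, 12]}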

    def get_sentence_boundaries(self):
        """
        Returns a list with the character offsets where each sentence starts,
        in order. The list contains one extra element at the end containing
        the total number of characters.
        """
        ys = [0]
        for sentence in self.sentences:
            y = self.dict_sentences[sentence.sentence_index]["offset_word"][-1]
            ys.append(y)
        return ys

    def get_parse_trees(self):
        # Syntactic parsing is not produced by this pipeline.
        pass

    def get_tokens(self):
        list_tokens = []
        for sentence in self.sentences:
            list_tokens.extend(sentence.tokens)
        return list_tokens

    def get_lemmas(self):
        # Lemmas are not produced; run_everything reuses the tokens instead.
        return []

    def get_token_offsets(self):
        # Character offset at which each token starts, plus the total length.
        list_offset = [0]
        for sentence in self.sentences:
            for token in sentence.tokens:
                _offset = list_offset[-1] + len(token)
                list_offset.append(_offset)
        return list_offset

    def get_pos(self):
        list_pos = []
        for sentence in self.sentences:
            list_pos.extend(sentence.pos_tags)
        return list_pos

    def get_found_entities(self, entity_key_prefix, gazette_manager=None):
        """
        Generates FoundEntity objects for the entities found. Entities that
        came from a gazette are keyed by their alias, so occurrences of the
        same alias and kind are joined.
        """
        found_entities = []
        for i, j, kind, alias in self.get_entity_occurrences():
            if gazette_manager is not None:
                from_gazette = gazette_manager.was_entry_created_by_gazette(alias, kind)
            else:
                from_gazette = False
            if from_gazette:
                kind = gazette_manager.strip_kind(kind)
                key = alias
            else:
                key = "{} {} {} {}".format(entity_key_prefix, kind, i, j)
            found_entities.append(FoundEntity(
                key=key,
                kind_name=kind,
                alias=alias,
                offset=i,
                offset_end=j,
                from_gazette=from_gazette
            ))
        return found_entities

    def get_entity_occurrences(self):
        """
        Returns a list of tuples (i, j, kind, text) such that `i` is the start
        character offset of an entity occurrence, `j` is the end offset, `kind`
        is the entity kind and `text` is the entity's surface text.
        """
        found_entities = []
        for entity in self.entitys:
            offset_begin = entity.wordOffset_begin
            offset_end = entity.wordOffset_end
            offset_sentence = self.dict_sentences[entity.sentence_index]["offset_word"][0]
            found_entities.append((offset_sentence + offset_begin,
                                   offset_sentence + offset_end,
                                   entity.entity_type,
                                   entity.entity_text))
        return found_entities
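    # Example element (values hypothetical): (120, 128, "company", "某某公司"),
    # i.e. a company entity spanning document characters 120..128.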

    def generate_spans_relations(self):
        """Create brat spans (T) and arcs (R) for every extracted entity and
        pointer, and return the corresponding pre-label strings."""
        logger.debug("%s entity length:%d" % (self.docid, len(self.entitys)))
        list_pre_label = []

        def char_span(entity):
            # Character offsets of the entity within the whole document.
            offset_sentence = self.dict_sentences[entity.sentence_index]["offset_word"][0]
            return (offset_sentence + entity.wordOffset_begin,
                    offset_sentence + entity.wordOffset_end)

        def link(origin_entity, target_entity, pointer_name):
            # Draw one brat arc between two already-created spans and record
            # its "R|type|origin_begin|origin_end|target_begin|target_end" label.
            _type = dict_relations[pointer_name]
            annotator.create_arc_interface(document=origin_entity.doc_id,
                                           origin=origin_entity.ann_id,
                                           target=target_entity.ann_id,
                                           type=_type)
            origin_begin, origin_end = char_span(origin_entity)
            target_begin, target_end = char_span(target_entity)
            list_pre_label.append("R|%s|%d|%d|%d|%d" % (_type, origin_begin, origin_end,
                                                        target_begin, target_end))

        # First pass: one brat span per entity, keeping the returned ann_id so
        # the arcs below can reference it.
        for _entity in self.entitys:
            begin, end = char_span(_entity)
            _type = getType(_entity)
            ann_id = annotator.create_span_interface(document=_entity.doc_id,
                                                     offsets=[[begin, end]],
                                                     _type=_type)
            _entity.ann_id = ann_id
            list_pre_label.append("T|%s|%d|%d" % (_type, begin, end))

        # Second pass: one brat arc per pointer attribute set on the entity.
        for _entity in self.entitys:
            if _entity.pointer_pack is not None:
                link(_entity, _entity.pointer_pack, "pointer_pack")
            if _entity.pointer_money is not None:
                link(_entity, _entity.pointer_money, "pointer_money")
            if _entity.pointer_person:
                for _pointer_person in _entity.pointer_person:
                    link(_entity, _pointer_person, "pointer_person")
            if _entity.person_phone:
                for _person_phone in _entity.person_phone:
                    link(_entity, _person_phone, "person_phone")
            if _entity.pointer_address is not None:
                link(_entity, _entity.pointer_address, "pointer_address")
            if _entity.pointer_tendereeMoney is not None:
                link(_entity, _entity.pointer_tendereeMoney, "pointer_tendereeMoney")
        return list_pre_label
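    # The returned labels are later joined with ';' into document.pre_label
    # (see SelfPreprocesser.run_everything). Illustrative value (hypothetical):
    #   "T|org_tenderee|10|18;R|rel_person|10|18|40|43"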


class SelfPreprocesser(BasePreProcessStepRunner):

    def __init__(self, increment_ner=False):
        super().__init__()
        self.increment_ner = increment_ner
        self.gazette_manager = None
        self.override = False
        self.step = PreProcessSteps.brat

    def __call__(self, document):
        self.run_everything(document)

    def run_everything(self, document):
        analysis = SelfAnalizer(document.human_identifier, document.sourcetext)
        # Only annotate documents of manageable size: more than 5 but fewer
        # than 200 entities, and fewer than 60 sentences.
        if 5 < len(analysis.entitys) < 200 and len(analysis.sentences) < 60:
            document.text = analysis.article.content
            # Tokenization
            tokens = analysis.get_tokens()
            offsets = analysis.get_token_offsets()
            document.set_tokenization_result(offsets, tokens)
            # Lemmatization (the tokens themselves are reused as lemmas)
            document.set_lemmatization_result(analysis.get_tokens())
            # "Sentencing" (splitting in sentences)
            document.set_sentencer_result(analysis.get_sentence_boundaries())
            # POS tagging
            # document.set_tagging_result(analysis.get_pos())
            # Syntactic parsing
            # document.set_syntactic_parsing_result(analysis.get_parse_trees())
            # NER is not used in brat
            # found_entities = analysis.get_found_entities(
            #     document.human_identifier, self.gazette_manager
            # )
            # document.set_ner_result(found_entities)
            # Save progress so far, so the next step doesn't modify `document`
            document.save()
            list_pre_label = analysis.generate_spans_relations()
            document.pre_label = ';'.join(list_pre_label)
            document.brat_done_at = datetime.datetime.now()
            document.save()
        else:
            # Mark the document as skipped.
            document.jump_signal = 1
            document.save()


if __name__ == "__main__":
    print(1)
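    # Minimal usage sketch (assumptions: a configured IEPY/Django environment
    # and a document model exposing human_identifier / sourcetext, as used in
    # run_everything above; "doc-001" is a hypothetical identifier):
    #
    #   from iepy.data.models import IEDocument
    #   document = IEDocument.objects.get(human_identifier="doc-001")
    #   SelfPreprocesser()(document)
    #   print(document.pre_label)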