# self_preprocess.py

import datetime
from collections import defaultdict
from itertools import chain, groupby
import logging
import tempfile
import json

from iepy.preprocess.pipeline import BasePreProcessStepRunner
from iepy.preprocess.ner.base import FoundEntity
from iepy.data.models import EntityOccurrence, GazetteItem
from brat.models import BratAnnotation as brat_annotations
# The star imports below supply helpers such as log() used in this module.
from iepy.selfpreprocess.BiddingKG.dl.interface.Connection import *
from iepy.selfpreprocess.BiddingKG.dl.common.Utils import *
from iepy.selfpreprocess.BiddingKG.dl.interface.Connection import getConnection
import iepy.selfpreprocess.BiddingKG.dl.interface.predictor as predictor
import iepy.selfpreprocess.BiddingKG.dl.interface.Preprocessing as Preprocessing
import iepy.selfpreprocess.BiddingKG.dl.interface.getAttributes as getAttributes
import iepy.selfpreprocess.BiddingKG.dl.entityLink.entityLink as entityLink
# PreProcessSteps is deliberately taken from the selfpreprocess pipeline; it
# would otherwise be shadowed if also imported from iepy.preprocess.pipeline.
from iepy.selfpreprocess.pipeline import PreProcessSteps
from iepy.webui.brat.src import annotator

logger = logging.getLogger(__name__)

# The BiddingKG predictors load their models once at module import time and
# are then reused for every document processed by this process.
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()
def predict(doc_id, text):
    """Run the full BiddingKG extraction pipeline over a single document and
    return its articles, sentences and entities (one list entry per document).
    """
    log("process %s" % doc_id)
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", ""]], useselffool=True)
    # Each predictor annotates the sentence/entity lists in place.
    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    premPredict.predict(list_sentences, list_entitys)
    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    epcPredict.predict(list_sentences, list_entitys)
    timePredict.predict(list_sentences, list_entitys)
    entityLink.link_entitys(list_entitys)
    _prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    log("extract done %s" % (str(_prem)))
    return list_articles, list_sentences, list_entitys
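
# Illustrative call (not executed here): the doc id and text are made-up
# placeholders, and the attribute names on the returned objects follow their
# use later in this module.
#
#   articles, sentences, entitys = predict("doc-0001", "...tender notice text...")
#   articles[0].content   # cleaned article text
#   sentences[0]          # sentences of the first (only) document
#   entitys[0]            # entities, each carrying entity_type and label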
# Mapping from (entity_type, predicted label) to the brat span type.
dict_type = {"org": {"0": "org_tenderee",
                     "1": "org_agency",
                     "2": "org_tenderer",
                     "3": "org_secondTenderer",
                     "4": "org_thirdTenderer",
                     "5": "org"},
             "company": {"0": "company_tenderee",
                         "1": "company_agency",
                         "2": "company_tenderer",
                         "3": "company_secondTenderer",
                         "4": "company_thirdTenderer",
                         "5": "company"},
             "money": {"0": "money_tendereeMoney",
                       "1": "money_tendererMoney",
                       "2": "money"},
             "person": {"0": "person",
                        "1": "person_tendereePerson",
                        "2": "person_agencyPerson",
                        "3": "person_person",
                        "4": "person_review"},
             "time": {"0": "time",
                      "1": "time_release",
                      "2": "time_bidopen",
                      "3": "time_bidclose"}}
# Mappings from predicted labels to brat attribute values.
dict_role_attribute = {"0": "att_tenderee",
                       "1": "att_agency",
                       "2": "att_tenderer",
                       "3": "att_secondTenderer",
                       "4": "att_thirdTenderer",
                       "5": "att_noRole"}

dict_money_attribute = {"0": "att_tendereeMoney",
                        "1": "att_tendererMoney",
                        "2": "money"}

dict_person_attribute = {"0": "att_noperson",
                         "1": "att_tendereePerson",
                         "2": "att_agencyPerson",
                         "3": "att_person"}

# Mapping from entity pointer attributes to brat relation (arc) types.
dict_relations = {"pointer_pack": "rel_pack",
                  "pointer_money": "rel_tendererMoney",
                  "pointer_person": "rel_person",
                  "pointer_address": "rel_address",
                  "pointer_tendereeMoney": "rel_tendereeMoney"}
def getAttribute(_entity):
    """Return the brat attributes for an entity, keyed by attribute family."""
    attribute = {"role": None, "money": None, "person": None}
    if _entity.entity_type in ["org", "company"]:
        attribute["role"] = dict_role_attribute[str(_entity.label)]
    if _entity.entity_type in ["money"]:
        attribute["money"] = dict_money_attribute[str(_entity.label)]
    if _entity.entity_type in ["person"]:
        attribute["person"] = dict_person_attribute[str(_entity.label)]
    # Drop the attribute families that do not apply to this entity type.
    return {_key: _value for _key, _value in attribute.items() if _value is not None}
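
# Example: a "money" entity with label 1 yields {"money": "att_tendererMoney"};
# the "role" and "person" keys are dropped because they stayed None.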
def getType(_entity):
    """Map an entity to its brat span type, falling back to the raw type."""
    if _entity.entity_type in dict_type:
        if str(_entity.label) in dict_type[_entity.entity_type]:
            return dict_type[_entity.entity_type][str(_entity.label)]
    return _entity.entity_type
class SelfAnalizer():

    def __init__(self, doc_id, sourceText):
        self.docid = doc_id
        list_articles, list_sentences, list_entitys = predict(doc_id, sourceText)
        # The pipeline was fed a single document, so take the first entry.
        self.article = list_articles[0]
        self.sentences = list_sentences[0]
        self.entitys = list_entitys[0]
        self.dict_sentences = self.get_sentences()
        # Delete any previously stored brat annotations for this document.
        brat_annotations.objects.filter(document_id=doc_id).delete()
    def get_sentences(self):
        """Build a per-sentence index with cumulative character offsets
        ("offset_word") and token offsets ("offset_words") in the document."""
        dict_sentences = dict()
        offset_word = 0   # running character offset
        offset_words = 0  # running token offset
        for sentence in self.sentences:
            if sentence.sentence_index not in dict_sentences:
                dict_sentences[sentence.sentence_index] = {"object": sentence,
                                                           "offset_word": [-1, -1],
                                                           "offset_words": [-1, -1]}
            dict_sentences[sentence.sentence_index]["offset_word"] = [
                offset_word, offset_word + len(sentence.sentence_text)]
            dict_sentences[sentence.sentence_index]["offset_words"] = [
                offset_words, offset_words + len(sentence.tokens)]
            offset_word += len(sentence.sentence_text)
            offset_words += len(sentence.tokens)
        return dict_sentences
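
    # The resulting index looks like (illustrative sizes, made-up numbers):
    #   {0: {"object": <Sentence>, "offset_word": [0, 120], "offset_words": [0, 35]},
    #    1: {"object": <Sentence>, "offset_word": [120, 250], "offset_words": [35, 80]}}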
    def get_sentence_boundaries(self):
        """
        Returns a list with the character offsets where each sentence starts,
        in order. The list contains one extra element at the end with the
        total character length of the document.
        """
        ys = [0]
        for sentence in self.sentences:
            y = self.dict_sentences[sentence.sentence_index]["offset_word"][-1]
            ys.append(y)
        return ys
    def get_parse_trees(self):
        # Syntactic parsing is not performed by this pipeline.
        pass

    def get_tokens(self):
        list_tokens = []
        for sentence in self.sentences:
            list_tokens.extend(sentence.tokens)
        return list_tokens

    def get_lemmas(self):
        # No lemmatizer is run; the caller substitutes tokens for lemmas.
        return []
    def get_token_offsets(self):
        """Cumulative character offsets: one entry per token start, plus a
        final entry with the total character length."""
        list_offset = [0]
        for sentence in self.sentences:
            for token in sentence.tokens:
                _offset = list_offset[-1] + len(token)
                list_offset.append(_offset)
        return list_offset
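
    # Example: tokens ["招标", "公告", "1"] yield [0, 2, 4, 5].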
    def get_pos(self):
        list_pos = []
        for sentence in self.sentences:
            list_pos.extend(sentence.pos_tags)
        return list_pos
    def get_found_entities(self, entity_key_prefix, gazette_manager=None):
        """
        Generates FoundEntity objects for the entities found.
        For all the entities that came from a gazette, joins
        the ones with the same kind.
        """
        found_entities = []
        for i, j, kind, alias in self.get_entity_occurrences():
            if gazette_manager is not None:
                from_gazette = gazette_manager.was_entry_created_by_gazette(alias, kind)
            else:
                from_gazette = False
            if from_gazette:
                kind = gazette_manager.strip_kind(kind)
                key = alias
            else:
                key = "{} {} {} {}".format(entity_key_prefix, kind, i, j)
            found_entities.append(FoundEntity(
                key=key,
                kind_name=kind,
                alias=alias,
                offset=i,
                offset_end=j,
                from_gazette=from_gazette
            ))
        return found_entities
    def get_entity_occurrences(self):
        """
        Returns a list of tuples (i, j, kind, alias) such that `i` is the
        start offset of an entity occurrence in the document, `j` is its end
        offset, `kind` is the entity kind and `alias` is the entity text.
        """
        found_entities = []
        for entity in self.entitys:
            offset_begin = entity.wordOffset_begin
            offset_end = entity.wordOffset_end
            # Shift sentence-relative offsets by the sentence's own start
            # offset to get document-level character offsets.
            offset_sentence = self.dict_sentences[entity.sentence_index]["offset_word"][0]
            found_entities.append((offset_sentence + offset_begin,
                                   offset_sentence + offset_end,
                                   entity.entity_type,
                                   entity.entity_text))
        return found_entities
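
    # Example: an entity spanning characters 5..9 of sentence 2, where
    # sentence 2 starts at document offset 120, is reported as (125, 129, ...).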
    def generate_spans_relations(self):
        logger.info("%s entity length:%d" % (self.docid, len(self.entitys)))
        # First pass: create a brat span for every entity, remembering the
        # annotation id so arcs can refer to it.
        for _entity in self.entitys:
            doc_id = _entity.doc_id
            sentence_start = self.dict_sentences[_entity.sentence_index]["offset_word"][0]
            offset = [[sentence_start + _entity.wordOffset_begin,
                       sentence_start + _entity.wordOffset_end]]
            _type = getType(_entity)
            ann_id = annotator.create_span_interface(document=doc_id, offsets=offset, _type=_type)
            _entity.ann_id = ann_id
        # Second pass: create the arcs. This can only run once every span
        # exists, because each arc targets another entity's ann_id.
        for _entity in self.entitys:
            for pointer_name, rel_type in dict_relations.items():
                pointed = getattr(_entity, pointer_name, None)
                if pointed is not None:
                    annotator.create_arc_interface(document=_entity.doc_id,
                                                   origin=_entity.ann_id,
                                                   target=pointed.ann_id,
                                                   type=rel_type)
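
    # Illustrative outcome (made-up annotation ids): a tenderer span T3 whose
    # pointer_money points at a money span T7 produces a brat arc of type
    # rel_tendererMoney from T3 to T7.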
class SelfPreprocesser(BasePreProcessStepRunner):

    def __init__(self, increment_ner=False):
        super().__init__()
        self.increment_ner = increment_ner
        self.gazette_manager = None
        self.override = False
        self.step = PreProcessSteps.brat

    def __call__(self, document):
        self.run_everything(document)
    def run_everything(self, document):
        analysis = SelfAnalizer(document.human_identifier, document.sourcetext)
        # Only process documents with a plausible number of entities
        # (more than 5 and fewer than 500); otherwise mark them as skipped.
        if len(analysis.entitys) > 5 and len(analysis.entitys) < 500:
            document.text = analysis.article.content
            # Tokenization
            tokens = analysis.get_tokens()
            offsets = analysis.get_token_offsets()
            document.set_tokenization_result(offsets, tokens)
            # Lemmatization: no lemmatizer is run, the tokens are reused.
            document.set_lemmatization_result(analysis.get_tokens())
            # "Sentencing" (splitting in sentences)
            document.set_sentencer_result(analysis.get_sentence_boundaries())
            # POS tagging
            # document.set_tagging_result(analysis.get_pos())
            # Syntactic parsing
            # document.set_syntactic_parsing_result(analysis.get_parse_trees())
            # NER is not used in brat
            # found_entities = analysis.get_found_entities(
            #     document.human_identifier, self.gazette_manager
            # )
            # document.set_ner_result(found_entities)
            # Save progress so far, so the next step doesn't modify `document`
            document.save()
            analysis.generate_spans_relations()
            document.brat_done_at = datetime.datetime.now()
            document.save()
        else:
            document.jump_signal = 1
            document.save()
if __name__ == "__main__":
    print(1)
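
# Minimal usage sketch (assumes an IEPY document instance `doc` obtained
# elsewhere, e.g. from iepy.data.models; `doc` is not part of this module):
#
#   preprocessor = SelfPreprocesser()
#   preprocessor(doc)  # __call__ delegates to run_everything(doc)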