stanford_preprocess.py

from collections import defaultdict
from itertools import chain, groupby
import logging
import tempfile

from iepy.preprocess import corenlp
from iepy.preprocess.pipeline import BasePreProcessStepRunner, PreProcessSteps
from iepy.preprocess.ner.base import FoundEntity
from iepy.data.models import EntityOccurrence, GazetteItem

logger = logging.getLogger(__name__)


class CoreferenceError(Exception):
    pass

class GazetteManager:
    _PREFIX = "__GAZETTE_"

    # Stanford NER default/native classes
    NATIVE_CLASSES = [
        'DATE', 'DURATION', 'LOCATION', 'MISC',
        'MONEY', 'NUMBER', 'ORDINAL', 'ORGANIZATION',
        'PERCENT', 'PERSON', 'SET', 'TIME',
    ]

    def __init__(self):
        self.gazette_items = list(GazetteItem.objects.all())
        self._cache_per_kind = defaultdict(list)
    def escape_text(self, text):
        # Wrap each token in \Q...\E so Stanford's (Java) regex engine treats
        # it as a literal, even if it contains regex metacharacters.
        text = " ".join(r"\Q{}\E".format(x) for x in text.split())
        return text

    def strip_kind(self, prefixed_kind):
        return prefixed_kind.split(self._PREFIX, 1)[-1]

    def was_entry_created_by_gazette(self, alias, kind):
        if kind.startswith(self._PREFIX):
            return True
        return alias in self._cache_per_kind[kind]
    def generate_stanford_gazettes_file(self):
        """
        Generates the gazettes file, if there are any gazette items. Returns
        the filepath when gazette items were found, else None.

        Note: the Stanford coreference annotator only handles entities of its
        native classes, which is why gazette items of such classes/kinds get
        some special treatment.

        As a side effect, populates the internal cache with the gazette items
        that will be passed to Stanford under any of the native classes
        (entity kinds).
        """
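        # Illustrative example (the item is hypothetical, not from the code):
        # a GazetteItem with text "Monty Python" and a non-native kind "GROUP"
        # would produce a tab-separated line like
        #     \QMonty\E \QPython\E<TAB>__GAZETTE_GROUP<TAB>DATE,DURATION,...,TIME
        # i.e. the escaped text, the (possibly prefixed) kind, and the native
        # classes this entry is allowed to override.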
        if not self.gazette_items:
            return
        overridable_classes = ",".join(self.NATIVE_CLASSES)
        self._cache_per_kind = defaultdict(list)
        gazette_format = "{}\t{}\t{}\n"
        _, filepath = tempfile.mkstemp()
        with open(filepath, "w") as gazette_file:
            for gazette in self.gazette_items:
                kname = gazette.kind.name
                if kname in self.NATIVE_CLASSES:
                    # Native kind: left unprefixed, but the item's text is
                    # remembered in the cache.
                    self._cache_per_kind[kname].append(gazette.text)
                else:
                    kname = "{}{}".format(self._PREFIX, kname)
                text = self.escape_text(gazette.text)
                line = gazette_format.format(text, kname, overridable_classes)
                gazette_file.write(line)
        return filepath

class StanfordPreprocess(BasePreProcessStepRunner):

    def __init__(self, increment_ner=False):
        super().__init__()
        self.gazette_manager = GazetteManager()
        gazettes_filepath = self.gazette_manager.generate_stanford_gazettes_file()
        self.corenlp = corenlp.get_analizer(gazettes_filepath=gazettes_filepath)
        self.override = False
        self.increment_ner = increment_ner
    def lemmatization_only(self, document):
        """Run only the lemmatization."""
        # Lemmatization was added after the first release, so we need to support
        # documents that have all the other steps done except lemmatization.
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))
        tokens = analysis.get_tokens()
        if document.tokens != tokens:
            raise ValueError(
                "Document changed since last tokenization, "
                "can't add lemmas to it"
            )
        document.set_lemmatization_result(analysis.get_lemmas())
        document.save()
    def syntactic_parsing_only(self, document):
        """Run only the syntactic parsing."""
        # Syntactic parsing was added after the first release, so we need to
        # provide the ability to run just this step on documents that have
        # all the other steps done except syntactic parsing.
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))
        parse_trees = analysis.get_parse_trees()
        document.set_syntactic_parsing_result(parse_trees)
        document.save()
    def increment_ner_only(self, document):
        """
        Runs only the NER steps (basic NER plus the gazetter), adding the newly
        found named entities.
        """
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))

        # NER
        found_entities = analysis.get_found_entities(
            document.human_identifier, self.gazette_manager
        )
        document.set_ner_result(found_entities)

        # Save progress so far, next step doesn't modify `document`
        document.save()

        # Coreference resolution
        for coref in analysis.get_coreferences():
            try:
                apply_coreferences(document, coref)
            except CoreferenceError as e:
                logger.warning(e)
    def __call__(self, document):
        """Checks the state of the document and, based on the preprocess options,
        decides what needs to be run and triggers it.
        """
        steps = [
            PreProcessSteps.tokenization,
            PreProcessSteps.sentencer,
            PreProcessSteps.tagging,
            PreProcessSteps.ner,
            # Steps added after 0.9.1
            PreProcessSteps.lemmatization,
            # Steps added after 0.9.2
            PreProcessSteps.syntactic_parsing,
        ]
        steps_done = set(s for s in steps if document.was_preprocess_step_done(s))

        if self.override or not steps_done:
            # No matter what the internal state of the document is, or any other
            # option on the StanfordPreprocess, everything needs to be run.
            self.run_everything(document)
        elif steps_done == set(steps):
            # All steps are already done...
            if self.increment_ner:
                self.increment_ner_only(document)
        else:
            # Deal with "incremental running" of the preprocess for documents
            # that were preprocessed with some older version of IEPY.
            # "initial_steps" are the ones added up to version 0.9.1, which (at
            # some point) were considered "all available steps".
            initial_steps = steps[:4]
            all_initials_done = set(initial_steps).issubset(steps_done)
            if all_initials_done:
                if PreProcessSteps.lemmatization not in steps_done:
                    self.lemmatization_only(document)
                if PreProcessSteps.syntactic_parsing not in steps_done:
                    self.syntactic_parsing_only(document)
            else:
                # Weird combination of steps done; we can't handle that right now.
                raise NotImplementedError(
                    "Running with mixed preprocess steps not supported, "
                    "must be 100% StanfordMultiStepRunner"
                )
    def run_everything(self, document):
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))

        # Tokenization
        tokens = analysis.get_tokens()
        offsets = analysis.get_token_offsets()
        document.set_tokenization_result(list(zip(offsets, tokens)))
        # Lemmatization
        document.set_lemmatization_result(analysis.get_lemmas())
        # "Sentencing" (splitting in sentences)
        document.set_sentencer_result(analysis.get_sentence_boundaries())
        # POS tagging
        document.set_tagging_result(analysis.get_pos())
        # Syntactic parsing
        document.set_syntactic_parsing_result(analysis.get_parse_trees())
        # NER
        found_entities = analysis.get_found_entities(
            document.human_identifier, self.gazette_manager
        )
        document.set_ner_result(found_entities)

        # Save progress so far, next step doesn't modify `document`
        document.save()

        # Coreference resolution
        for coref in analysis.get_coreferences():
            try:
                apply_coreferences(document, coref)
            except CoreferenceError as e:
                logger.warning(e)

def _dict_path(d, *steps):
    """Traverses through a dict of dicts.

    Always returns a list: if the object to return is not a list, it's
    wrapped in one. If any of the path steps does not exist, an empty list
    is returned.
    """
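    # Intended behaviour, illustrated (examples added for clarity, not part of
    # the original source):
    #   _dict_path({"a": {"b": 1}}, "a", "b")       -> [1]
    #   _dict_path({"a": {"b": [1, 2]}}, "a", "b")  -> [1, 2]
    #   _dict_path({"a": {}}, "a", "b")             -> []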
    x = d
    for key in steps:
        try:
            x = x[key]
        except KeyError:
            return []
    if not isinstance(x, list):
        x = [x]
    return x

class StanfordAnalysis:
    """Helper for extracting the information from the Stanford CoreNLP output."""

    def __init__(self, data):
        self._data = data
        self.sentences = self.get_sentences()
        self._raw_tokens = list(chain.from_iterable(self.sentences))

    def _get(self, *args):
        return _dict_path(self._data, *args)

    def get_sentences(self):
        result = []
        raw_sentences = self._get("sentences", "sentence")
        for sentence in raw_sentences:
            xs = []
            tokens = _dict_path(sentence, "tokens", "token")
            for t in tokens:
                xs.append(t)
            result.append(xs)
        return result
    def get_sentence_boundaries(self):
        """
        Returns a list with the offsets in tokens where each sentence starts, in
        order. The list contains one extra element at the end containing the total
        number of tokens.
        """
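        # Example (illustrative): for a document whose sentences have 3, 5 and
        # 2 tokens, this returns [0, 3, 8, 10].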
        ys = [0]
        for x in self.sentences:
            y = ys[-1] + len(x)
            ys.append(y)
        return ys

    def get_parse_trees(self):
        result = [x["parse"] for x in self._get("sentences", "sentence")]
        return result

    def get_tokens(self):
        return [x["word"] for x in self._raw_tokens]

    def get_lemmas(self):
        return [x["lemma"] for x in self._raw_tokens]

    def get_token_offsets(self):
        return [int(x["CharacterOffsetBegin"]) for x in self._raw_tokens]

    def get_pos(self):
        return [x["POS"] for x in self._raw_tokens]
    def get_found_entities(self, entity_key_prefix, gazette_manager=None):
        """
        Generates FoundEntity objects for the entities found.

        Entities that came from a gazette are keyed by their alias, so
        occurrences of the same gazette entry end up joined together.
        """
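        # Key format, illustrated (example values are hypothetical): a PERSON
        # found by plain NER at tokens [4, 6) in document "doc-1" gets the key
        # "doc-1 PERSON 4 6", while a gazette match uses just its alias
        # (e.g. "Monty Python") as the key.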
        found_entities = []
        tokens = self.get_tokens()
        for i, j, kind in self.get_entity_occurrences():
            alias = " ".join(tokens[i:j])

            if gazette_manager is not None:
                from_gazette = gazette_manager.was_entry_created_by_gazette(alias, kind)
            else:
                from_gazette = False

            if from_gazette:
                kind = gazette_manager.strip_kind(kind)
                key = alias
            else:
                key = "{} {} {} {}".format(entity_key_prefix, kind, i, j)

            found_entities.append(FoundEntity(
                key=key,
                kind_name=kind,
                alias=alias,
                offset=i,
                offset_end=j,
                from_gazette=from_gazette
            ))
        return found_entities
    def get_entity_occurrences(self):
        """
        Returns a list of tuples (i, j, kind) such that `i` is the start
        offset of an entity occurrence, `j` is the end offset and `kind` is the
        entity kind of the entity.
        """
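        # Example (illustrative): if the NER tags tokens 4 and 5 of the
        # document as PERSON, the resulting tuple is (4, 6, "PERSON");
        # `i` is inclusive, `j` is exclusive, both relative to the document.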
        found_entities = []
        offset = 0
        for words in self.sentences:
            for kind, group in groupby(enumerate(words), key=lambda x: x[1]["NER"]):
                if kind == "O":
                    continue
                ix = [i for i, word in group]
                i = ix[0] + offset
                j = ix[-1] + 1 + offset
                found_entities.append((i, j, kind))
            offset += len(words)
        return found_entities
    def get_coreferences(self):
        """
        Returns a list of lists of tuples (i, j, k) such that `i` is the start
        offset of a reference, `j` is the end offset and `k` is the index of the
        head word within the reference.
        All offsets are in tokens and relative to the start of the document.
        All references within the same list refer to the same entity.
        All references in different lists refer to different entities.
        """
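        # Example (illustrative): [[(0, 2, 1), (7, 8, 7)]] means that tokens
        # [0, 2) and token 7 refer to the same entity, with head words at
        # tokens 1 and 7; the representative mention is listed first.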
        sentence_offsets = self.get_sentence_boundaries()
        coreferences = []
        for mention in self._get("coreference", "coreference"):
            occurrences = []
            representative = 0
            for r, occurrence in enumerate(_dict_path(mention, "mention")):
                if "@representative" in occurrence:
                    representative = r
                sentence = int(occurrence["sentence"]) - 1
                offset = sentence_offsets[sentence]
                i = int(occurrence["start"]) - 1 + offset
                j = int(occurrence["end"]) - 1 + offset
                k = int(occurrence["head"]) - 1 + offset
                occurrences.append((i, j, k))

            # Occurrences' representative goes in the first position
            k = representative
            occurrences[0], occurrences[k] = occurrences[k], occurrences[0]
            coreferences.append(occurrences)
        return coreferences

def issues_merging_entities(document, entities):
    # Checks if some general preconditions are met before proceeding to merge
    # some entities on a given document.
    kinds = set(e.kind for e in entities)
    if len(kinds) != 1:
        return "Cannot merge entities of different kinds {!r}".format(kinds)
    gazettes = set(e.gazette for e in entities if e.gazette)
    if len(gazettes) > 1:
        return "Cannot merge entities of different gazette items {!r}".format(gazettes)

def apply_coreferences(document, coreferences):
    """
    Makes all entity occurrences listed in `coreferences` share the same
    entity, using the coreference information to merge the occurrences'
    entities into a single one.

    `coreferences` is a list of tuples (i, j, head) where:
        - `i` is the offset in tokens where the occurrence starts.
        - `j` is the offset in tokens where the occurrence ends.
        - `head` is the index in tokens of the head of the occurrence (the
          "most important word").

    Each entity occurrence in `coreferences` may or may not already exist in
    `document`. If no occurrence exists in `document` then nothing is done.
    If at least one occurrence exists in `document` then all the other
    occurrences named in `coreferences` are created automatically.

    This function can raise CoreferenceError in case a merge is attempted on
    entities of different kinds.
    """
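    # Example (illustrative): with coreferences=[(10, 12, 11), (30, 31, 30)],
    # if an existing (non-anaphoric) occurrence covers head token 11, a new
    # occurrence of that same entity is created for token 30 and marked as
    # anaphora.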
    # For each token index, make a list of the occurrences there
    occurrences = defaultdict(list)
    for occurrence in document.entity_occurrences.all():
        for i in range(occurrence.offset, occurrence.offset_end):
            occurrences[i].append(occurrence)

    entities = []  # Existing entities referenced by the coreferences
    pickable_as_representant = []
    missing = []  # References that have no entity occurrence yet
    for i, j, head in sorted(coreferences):
        if occurrences[head]:
            for x in occurrences[head]:
                entities.append(x.entity)
                if not x.anaphora:
                    pickable_as_representant.append(x.entity)
        else:
            missing.append((i, j, head))

    if not pickable_as_representant:
        return

    issues = issues_merging_entities(document, entities)
    if issues:
        raise CoreferenceError(issues)

    from_ner = [r for r in pickable_as_representant if not r.gazette]
    if from_ner:
        canonical = from_ner[0]
    else:
        canonical = pickable_as_representant[0]

    # Each missing coreference needs to be created into an occurrence now
    for i, j, head in missing:
        if j - i >= 5:  # If the entity is a long phrase then just keep one token
            i = head
            j = head + 1
        EntityOccurrence.objects.get_or_create(
            document=document,
            entity=canonical,
            offset=i,
            offset_end=j,
            alias=" ".join(document.tokens[i:j]),
            defaults={'anaphora': True})

    # Finally, the merging 'per se', where all things are entity occurrences
    for entity in set(x for x in entities if x != canonical):
        for occurrence in EntityOccurrence.objects.filter(entity=entity,
                                                          document=document):
            occurrence.entity = canonical
            occurrence.save()
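
# Minimal usage sketch (illustrative; assumes an already-stored IEPY document
# object `doc` with its raw text set):
#
#     preprocess = StanfordPreprocess(increment_ner=True)
#     preprocess(doc)   # runs whichever preprocess steps `doc` still needs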