from collections import defaultdict
from itertools import chain, groupby
import logging
import tempfile

from iepy.preprocess import corenlp
from iepy.preprocess.pipeline import BasePreProcessStepRunner, PreProcessSteps
from iepy.preprocess.ner.base import FoundEntity
from iepy.data.models import EntityOccurrence, GazetteItem

logger = logging.getLogger(__name__)


class CoreferenceError(Exception):
    pass


class GazetteManager:
    _PREFIX = "__GAZETTE_"

    # Stanford NER default/native classes
    NATIVE_CLASSES = [
        'DATE', 'DURATION', 'LOCATION', 'MISC',
        'MONEY', 'NUMBER', 'ORDINAL', 'ORGANIZATION',
        'PERCENT', 'PERSON', 'SET', 'TIME',
    ]

    def __init__(self):
        self.gazette_items = list(GazetteItem.objects.all())
        self._cache_per_kind = defaultdict(list)
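
    # Note: \Q...\E is Perl-style regex quoting, so Stanford's RegexNER
    # matches each gazette token literally. Illustrative example:
    #   escape_text("New York") -> "\QNew\E \QYork\E"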
    def escape_text(self, text):
        text = " ".join("\\Q{}\\E".format(x) for x in text.split())
        return text

    def strip_kind(self, prefixed_kind):
        return prefixed_kind.split(self._PREFIX, 1)[-1]

    def was_entry_created_by_gazette(self, alias, kind):
        if kind.startswith(self._PREFIX):
            return True
        return alias in self._cache_per_kind[kind]
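
    # Each gazette line written below follows Stanford RegexNER's TSV layout:
    #   <escaped tokens>\t<kind>\t<overridable classes>
    # e.g., for a hypothetical custom "CITY" kind:
    #   \QNew\E \QYork\E\t__GAZETTE_CITY\tDATE,DURATION,...,TIME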
    def generate_stanford_gazettes_file(self):
        """
        Generates the gazettes file if there are any gazette items. Returns
        the filepath if gazette items were found, else None.

        Note: the Stanford coreference annotator only handles entities of the
        native classes; that's why gazette items of those classes/kinds get
        special treatment.

        As a side effect, populates the internal cache with the gazette items
        that will be passed to Stanford under any of the native classes
        (entity kinds).
        """
        if not self.gazette_items:
            return
        overridable_classes = ",".join(self.NATIVE_CLASSES)
        self._cache_per_kind = defaultdict(list)
        gazette_format = "{}\t{}\t{}\n"
        _, filepath = tempfile.mkstemp()
        with open(filepath, "w") as gazette_file:
            for gazette in self.gazette_items:
                kname = gazette.kind.name
                if kname in self.NATIVE_CLASSES:
                    # The kind is not escaped, but its tokens are stored in the cache
                    self._cache_per_kind[kname].append(gazette.text)
                else:
                    kname = "{}{}".format(self._PREFIX, kname)
                text = self.escape_text(gazette.text)
                line = gazette_format.format(text, kname, overridable_classes)
                gazette_file.write(line)
        return filepath


class StanfordPreprocess(BasePreProcessStepRunner):

    def __init__(self, increment_ner=False):
        super().__init__()
        self.gazette_manager = GazetteManager()
        gazettes_filepath = self.gazette_manager.generate_stanford_gazettes_file()
        self.corenlp = corenlp.get_analizer(gazettes_filepath=gazettes_filepath)
        self.override = False
        self.increment_ner = increment_ner

    def lemmatization_only(self, document):
        """Run only the lemmatization."""
        # Lemmatization was added after the first release, so we need to
        # support documents that have every step done except lemmatization.
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))
        tokens = analysis.get_tokens()
        if document.tokens != tokens:
            raise ValueError(
                "Document changed since last tokenization, "
                "can't add lemmas to it"
            )
        document.set_lemmatization_result(analysis.get_lemmas())
        document.save()

    def syntactic_parsing_only(self, document):
        """Run only the syntactic parsing."""
        # Syntactic parsing was added after the first release, so we need to
        # support doing just this step on documents that have every other
        # step done.
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))
        parse_trees = analysis.get_parse_trees()
        document.set_syntactic_parsing_result(parse_trees)
        document.save()

    def increment_ner_only(self, document):
        """
        Runs the NER steps (basic NER plus gazettes), adding the newly found
        named entities.
        """
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))

        # NER
        found_entities = analysis.get_found_entities(
            document.human_identifier, self.gazette_manager
        )
        document.set_ner_result(found_entities)

        # Save progress so far; the next step doesn't modify `document`
        document.save()

        # Coreference resolution
        for coref in analysis.get_coreferences():
            try:
                apply_coreferences(document, coref)
            except CoreferenceError as e:
                logger.warning(e)

    def __call__(self, document):
        """Checks the state of the document and, based on the preprocess
        options, decides what needs to be run and triggers it.
        """
        steps = [
            PreProcessSteps.tokenization,
            PreProcessSteps.sentencer,
            PreProcessSteps.tagging,
            PreProcessSteps.ner,
            # Steps added after 0.9.1
            PreProcessSteps.lemmatization,
            # Steps added after 0.9.2
            PreProcessSteps.syntactic_parsing,
        ]
        steps_done = set([s for s in steps if document.was_preprocess_step_done(s)])

        if self.override or not steps_done:
            # No matter the internal state of the document, or any other
            # StanfordPreprocess option, everything needs to be run.
            self.run_everything(document)
        elif steps_done == set(steps):
            # All steps are already done...
            if self.increment_ner:
                self.increment_ner_only(document)
        else:
            # Accept "incremental running" of the preprocess for documents
            # that were preprocessed with an older version of IEPY.
            # "initial_steps" are the ones added up to version 0.9.1, which
            # (at some point) were considered "all available steps".
            initial_steps = steps[:4]
            all_initials_done = set(initial_steps).issubset(steps_done)
            if all_initials_done:
                if PreProcessSteps.lemmatization not in steps_done:
                    self.lemmatization_only(document)
                if PreProcessSteps.syntactic_parsing not in steps_done:
                    self.syntactic_parsing_only(document)
            else:
                # Weird combination of steps done; we can't handle that right now.
                raise NotImplementedError(
                    "Running with mixed preprocess steps not supported, "
                    "must be 100% StanfordMultiStepRunner"
                )

    def run_everything(self, document):
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))

        # Tokenization
        tokens = analysis.get_tokens()
        offsets = analysis.get_token_offsets()
        document.set_tokenization_result(list(zip(offsets, tokens)))

        # Lemmatization
        document.set_lemmatization_result(analysis.get_lemmas())

        # "Sentencing" (splitting into sentences)
        document.set_sentencer_result(analysis.get_sentence_boundaries())

        # POS tagging
        document.set_tagging_result(analysis.get_pos())

        # Syntactic parsing
        document.set_syntactic_parsing_result(analysis.get_parse_trees())

        # NER
        found_entities = analysis.get_found_entities(
            document.human_identifier, self.gazette_manager
        )
        document.set_ner_result(found_entities)

        # Save progress so far; the next step doesn't modify `document`
        document.save()

        # Coreference resolution
        for coref in analysis.get_coreferences():
            try:
                apply_coreferences(document, coref)
            except CoreferenceError as e:
                logger.warning(e)
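

# Illustrative behaviour of the helper below:
#   _dict_path({"a": {"b": 1}}, "a", "b")      -> [1]
#   _dict_path({"a": {"b": [1, 2]}}, "a", "b") -> [1, 2]
#   _dict_path({}, "a")                        -> []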
def _dict_path(d, *steps):
    """Traverses through a dict of dicts.
    Always returns a list: if the object to return is not a list, it's
    wrapped in one. If any step of the path does not exist, an empty
    list is returned.
    """
    x = d
    for key in steps:
        try:
            x = x[key]
        except KeyError:
            return []
    if not isinstance(x, list):
        x = [x]
    return x


class StanfordAnalysis:
    """Helper for extracting the information from the Stanford CoreNLP output."""

    def __init__(self, data):
        self._data = data
        self.sentences = self.get_sentences()
        self._raw_tokens = list(chain.from_iterable(self.sentences))

    def _get(self, *args):
        return _dict_path(self._data, *args)

    def get_sentences(self):
        result = []
        raw_sentences = self._get("sentences", "sentence")
        for sentence in raw_sentences:
            xs = []
            tokens = _dict_path(sentence, "tokens", "token")
            for t in tokens:
                xs.append(t)
            result.append(xs)
        return result
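
    # e.g. sentences of 4, 2, and 3 tokens give [0, 4, 6, 9]; the pair
    # (boundaries[i], boundaries[i + 1]) delimits sentence i in token offsets.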
    def get_sentence_boundaries(self):
        """
        Returns a list with the offsets in tokens where each sentence starts,
        in order. The list contains one extra element at the end containing
        the total number of tokens.
        """
        ys = [0]
        for x in self.sentences:
            y = ys[-1] + len(x)
            ys.append(y)
        return ys

    def get_parse_trees(self):
        result = [x["parse"] for x in self._get("sentences", "sentence")]
        return result

    def get_tokens(self):
        return [x["word"] for x in self._raw_tokens]

    def get_lemmas(self):
        return [x["lemma"] for x in self._raw_tokens]

    def get_token_offsets(self):
        return [int(x["CharacterOffsetBegin"]) for x in self._raw_tokens]

    def get_pos(self):
        return [x["POS"] for x in self._raw_tokens]

    def get_found_entities(self, entity_key_prefix, gazette_manager=None):
        """
        Generates FoundEntity objects for the entities found.
        For all the entities that came from a gazette, joins the ones with
        the same kind.
        """
        found_entities = []
        tokens = self.get_tokens()
        for i, j, kind in self.get_entity_occurrences():
            alias = " ".join(tokens[i:j])
            if gazette_manager is not None:
                from_gazette = gazette_manager.was_entry_created_by_gazette(alias, kind)
            else:
                from_gazette = False

            if from_gazette:
                kind = gazette_manager.strip_kind(kind)
                key = alias
            else:
                key = "{} {} {} {}".format(entity_key_prefix, kind, i, j)

            found_entities.append(FoundEntity(
                key=key,
                kind_name=kind,
                alias=alias,
                offset=i,
                offset_end=j,
                from_gazette=from_gazette
            ))
        return found_entities
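
    # groupby() below collapses consecutive tokens that share a NER tag; e.g.
    # tags ["O", "PERSON", "PERSON", "O"] in the first sentence yield
    # [(1, 3, "PERSON")].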
    def get_entity_occurrences(self):
        """
        Returns a list of tuples (i, j, kind) such that `i` is the start
        offset of an entity occurrence, `j` is the end offset and `kind` is
        the entity kind.
        """
        found_entities = []
        offset = 0
        for words in self.sentences:
            for kind, group in groupby(enumerate(words), key=lambda x: x[1]["NER"]):
                if kind == "O":
                    continue
                ix = [i for i, word in group]
                i = ix[0] + offset
                j = ix[-1] + 1 + offset
                found_entities.append((i, j, kind))
            offset += len(words)
        return found_entities
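
    # Stanford reports sentence and token positions 1-based; the method below
    # converts them to 0-based offsets relative to the document start.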
    def get_coreferences(self):
        """
        Returns a list of lists of tuples (i, j, k) such that `i` is the start
        offset of a reference, `j` is the end offset and `k` is the index of
        the head word within the reference.
        All offsets are in tokens and relative to the start of the document.
        All references within the same list refer to the same entity.
        All references in different lists refer to different entities.
        """
        sentence_offsets = self.get_sentence_boundaries()
        coreferences = []
        for mention in self._get("coreference", "coreference"):
            occurrences = []
            representative = 0
            for r, occurrence in enumerate(_dict_path(mention, "mention")):
                if "@representative" in occurrence:
                    representative = r
                sentence = int(occurrence["sentence"]) - 1
                offset = sentence_offsets[sentence]
                i = int(occurrence["start"]) - 1 + offset
                j = int(occurrence["end"]) - 1 + offset
                k = int(occurrence["head"]) - 1 + offset
                occurrences.append((i, j, k))
            # The representative occurrence goes in the first position
            k = representative
            occurrences[0], occurrences[k] = occurrences[k], occurrences[0]
            coreferences.append(occurrences)
        return coreferences
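

# Returns a human-readable description of the problem, or None when the merge
# is safe; apply_coreferences() raises CoreferenceError on any non-None result.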
def issues_merging_entities(document, entities):
    # Checks if some general preconditions are met before proceeding to merge
    # some entities on a given document.
    kinds = set(e.kind for e in entities)
    if len(kinds) != 1:
        return "Cannot merge entities of different kinds {!r}".format(kinds)
    gazettes = set(e.gazette for e in entities if e.gazette)
    if len(gazettes) > 1:
        return "Cannot merge entities of different gazette items {!r}".format(gazettes)


def apply_coreferences(document, coreferences):
    """
    Makes all entity occurrences listed in `coreferences` refer to the same
    entity.
    It uses the coreference information to merge the entity occurrences'
    entities into a single entity.

    `coreferences` is a list of tuples (i, j, head) where:
        - `i` is the offset in tokens where the occurrence starts.
        - `j` is the offset in tokens where the occurrence ends.
        - `head` is the index in tokens of the head of the occurrence (the
          "most important" word).

    Every entity occurrence in `coreferences` may or may not already exist in
    `document`. If no occurrence exists in `document` then nothing is done.
    If at least one occurrence exists in `document` then all other occurrences
    named in `coreferences` are automatically created.

    This function can raise CoreferenceError in case a merge is attempted on
    entities of different kinds.
    """
    # For each token index, list the occurrences located there
    occurrences = defaultdict(list)
    for occurrence in document.entity_occurrences.all():
        for i in range(occurrence.offset, occurrence.offset_end):
            occurrences[i].append(occurrence)

    entities = []  # Existing entities referenced by the coreferences
    pickable_as_representant = []
    missing = []  # References that have no entity occurrence yet
    for i, j, head in sorted(coreferences):
        if occurrences[head]:
            for x in occurrences[head]:
                entities.append(x.entity)
                if not x.anaphora:
                    pickable_as_representant.append(x.entity)
        else:
            missing.append((i, j, head))

    if not pickable_as_representant:
        return
    issues = issues_merging_entities(document, entities)
    if issues:
        raise CoreferenceError(issues)

    from_ner = [r for r in pickable_as_representant if not r.gazette]
    if from_ner:
        canonical = from_ner[0]
    else:
        canonical = pickable_as_representant[0]

    # Each missing coreference needs to be created as an occurrence now
    for i, j, head in missing:
        if j - i >= 5:  # If the entity is a long phrase, keep only the head token
            i = head
            j = head + 1
        EntityOccurrence.objects.get_or_create(
            document=document,
            entity=canonical,
            offset=i,
            offset_end=j,
            alias=" ".join(document.tokens[i:j]),
            defaults={'anaphora': True})

    # Finally, the merging per se, where everything is an entity occurrence
    for entity in set(x for x in entities if x != canonical):
        for occurrence in EntityOccurrence.objects.filter(entity=entity,
                                                          document=document):
            occurrence.entity = canonical
            occurrence.save()
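

# Minimal usage sketch (illustrative; assumes a configured IEPY project, so
# the Django models are available, and a `document` from iepy.data.models):
#
#     runner = StanfordPreprocess(increment_ner=True)
#     runner(document)  # runs only whatever preprocess steps are still missing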