stanford_preprocess.py

from collections import defaultdict
from itertools import chain, groupby
import logging
import tempfile

from iepy.preprocess import corenlp
from iepy.preprocess.pipeline import BasePreProcessStepRunner, PreProcessSteps
from iepy.preprocess.ner.base import FoundEntity
from iepy.data.models import EntityOccurrence, GazetteItem

logger = logging.getLogger(__name__)


class CoreferenceError(Exception):
    pass

class GazetteManager:
    _PREFIX = "__GAZETTE_"

    # Stanford NER default/native classes
    NATIVE_CLASSES = [
        'DATE', 'DURATION', 'LOCATION', 'MISC',
        'MONEY', 'NUMBER', 'ORDINAL', 'ORGANIZATION',
        'PERCENT', 'PERSON', 'SET', 'TIME',
    ]

    def __init__(self):
        self.gazette_items = list(GazetteItem.objects.all())
        self._cache_per_kind = defaultdict(list)
    def escape_text(self, text):
        # Wrap each token in \Q...\E so Stanford's (Java) regex engine treats
        # it as a literal, even if it contains regex metacharacters.
        text = " ".join(r"\Q{}\E".format(x) for x in text.split())
        return text

    def strip_kind(self, prefixed_kind):
        return prefixed_kind.split(self._PREFIX, 1)[-1]

    def was_entry_created_by_gazette(self, alias, kind):
        if kind.startswith(self._PREFIX):
            return True
        return alias in self._cache_per_kind[kind]
    def generate_stanford_gazettes_file(self):
        """
        Generates the gazettes file, if there are any gazette items. Returns
        the filepath when gazette items were found, else None.

        Note: the Stanford coreference annotator only handles entities of its
        native classes, which is why gazette items of such classes/kinds get
        some special treatment.

        As a side effect, populates the internal cache with the gazette items
        that will be passed to Stanford under any of the native classes
        (entity kinds).
        """
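        # Illustrative example (the item is hypothetical, not from the code):
        # a GazetteItem with text "Monty Python" and a non-native kind "GROUP"
        # would produce a tab-separated line like
        #     \QMonty\E \QPython\E<TAB>__GAZETTE_GROUP<TAB>DATE,DURATION,...,TIME
        # i.e. the escaped text, the (possibly prefixed) kind, and the native
        # classes this entry is allowed to override.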
        if not self.gazette_items:
            return
        overridable_classes = ",".join(self.NATIVE_CLASSES)
        self._cache_per_kind = defaultdict(list)
        gazette_format = "{}\t{}\t{}\n"
        _, filepath = tempfile.mkstemp()
        with open(filepath, "w") as gazette_file:
            for gazette in self.gazette_items:
                kname = gazette.kind.name
                if kname in self.NATIVE_CLASSES:
                    # Native kind: left unprefixed, but the item's text is
                    # remembered in the cache.
                    self._cache_per_kind[kname].append(gazette.text)
                else:
                    kname = "{}{}".format(self._PREFIX, kname)
                text = self.escape_text(gazette.text)
                line = gazette_format.format(text, kname, overridable_classes)
                gazette_file.write(line)
        return filepath

class StanfordPreprocess(BasePreProcessStepRunner):

    def __init__(self, increment_ner=False):
        super().__init__()
        self.gazette_manager = GazetteManager()
        gazettes_filepath = self.gazette_manager.generate_stanford_gazettes_file()
        self.corenlp = corenlp.get_analizer(gazettes_filepath=gazettes_filepath)
        self.override = False
        self.increment_ner = increment_ner
    def lemmatization_only(self, document):
        """Run only the lemmatization."""
        # Lemmatization was added after the first release, so we need to support
        # documents that have all the other steps done except lemmatization.
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))
        tokens = analysis.get_tokens()
        if document.tokens != tokens:
            raise ValueError(
                "Document changed since last tokenization, "
                "can't add lemmas to it"
            )
        document.set_lemmatization_result(analysis.get_lemmas())
        document.save()
    def syntactic_parsing_only(self, document):
        """Run only the syntactic parsing."""
        # Syntactic parsing was added after the first release, so we need to
        # provide the ability to run just this step on documents that have
        # all the other steps done except syntactic parsing.
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))
        parse_trees = analysis.get_parse_trees()
        document.set_syntactic_parsing_result(parse_trees)
        document.save()
    def increment_ner_only(self, document):
        """
        Runs only the NER steps (basic NER plus the gazetter), adding the newly
        found named entities.
        """
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))

        # NER
        found_entities = analysis.get_found_entities(
            document.human_identifier, self.gazette_manager
        )
        document.set_ner_result(found_entities)

        # Save progress so far, next step doesn't modify `document`
        document.save()

        # Coreference resolution
        for coref in analysis.get_coreferences():
            try:
                apply_coreferences(document, coref)
            except CoreferenceError as e:
                logger.warning(e)
    def __call__(self, document):
        """Checks the state of the document and, based on the preprocess options,
        decides what needs to be run and triggers it.
        """
        steps = [
            PreProcessSteps.tokenization,
            PreProcessSteps.sentencer,
            PreProcessSteps.tagging,
            PreProcessSteps.ner,
            # Steps added after 0.9.1
            PreProcessSteps.lemmatization,
            # Steps added after 0.9.2
            PreProcessSteps.syntactic_parsing,
        ]
        steps_done = set(s for s in steps if document.was_preprocess_step_done(s))

        if self.override or not steps_done:
            # No matter what the internal state of the document is, or any other
            # option on the StanfordPreprocess, everything needs to be run.
            self.run_everything(document)
        elif steps_done == set(steps):
            # All steps are already done...
            if self.increment_ner:
                self.increment_ner_only(document)
        else:
            # Deal with "incremental running" of the preprocess for documents
            # that were preprocessed with some older version of IEPY.
            # "initial_steps" are the ones added up to version 0.9.1, which (at
            # some point) were considered "all available steps".
            initial_steps = steps[:4]
            all_initials_done = set(initial_steps).issubset(steps_done)
            if all_initials_done:
                if PreProcessSteps.lemmatization not in steps_done:
                    self.lemmatization_only(document)
                if PreProcessSteps.syntactic_parsing not in steps_done:
                    self.syntactic_parsing_only(document)
            else:
                # Weird combination of steps done; we can't handle that right now.
                raise NotImplementedError(
                    "Running with mixed preprocess steps not supported, "
                    "must be 100% StanfordMultiStepRunner"
                )
    def run_everything(self, document):
        analysis = StanfordAnalysis(self.corenlp.analyse(document.text))

        # Tokenization
        tokens = analysis.get_tokens()
        offsets = analysis.get_token_offsets()
        document.set_tokenization_result(list(zip(offsets, tokens)))
        # Lemmatization
        document.set_lemmatization_result(analysis.get_lemmas())
        # "Sentencing" (splitting in sentences)
        document.set_sentencer_result(analysis.get_sentence_boundaries())
        # POS tagging
        document.set_tagging_result(analysis.get_pos())
        # Syntactic parsing
        document.set_syntactic_parsing_result(analysis.get_parse_trees())
        # NER
        found_entities = analysis.get_found_entities(
            document.human_identifier, self.gazette_manager
        )
        document.set_ner_result(found_entities)

        # Save progress so far, next step doesn't modify `document`
        document.save()

        # Coreference resolution
        for coref in analysis.get_coreferences():
            try:
                apply_coreferences(document, coref)
            except CoreferenceError as e:
                logger.warning(e)

def _dict_path(d, *steps):
    """Traverses through a dict of dicts.

    Always returns a list: if the object to return is not a list, it's
    wrapped in one. If any of the path steps does not exist, an empty list
    is returned.
    """
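    # Intended behaviour, illustrated (examples added for clarity, not part of
    # the original source):
    #   _dict_path({"a": {"b": 1}}, "a", "b")       -> [1]
    #   _dict_path({"a": {"b": [1, 2]}}, "a", "b")  -> [1, 2]
    #   _dict_path({"a": {}}, "a", "b")             -> []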
    x = d
    for key in steps:
        try:
            x = x[key]
        except KeyError:
            return []
    if not isinstance(x, list):
        x = [x]
    return x

class StanfordAnalysis:
    """Helper for extracting the information from the Stanford CoreNLP output."""

    def __init__(self, data):
        self._data = data
        self.sentences = self.get_sentences()
        self._raw_tokens = list(chain.from_iterable(self.sentences))

    def _get(self, *args):
        return _dict_path(self._data, *args)

    def get_sentences(self):
        result = []
        raw_sentences = self._get("sentences", "sentence")
        for sentence in raw_sentences:
            xs = []
            tokens = _dict_path(sentence, "tokens", "token")
            for t in tokens:
                xs.append(t)
            result.append(xs)
        return result
    def get_sentence_boundaries(self):
        """
        Returns a list with the offsets in tokens where each sentence starts, in
        order. The list contains one extra element at the end containing the total
        number of tokens.
        """
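        # Example (illustrative): for a document whose sentences have 3, 5 and
        # 2 tokens, this returns [0, 3, 8, 10].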
        ys = [0]
        for x in self.sentences:
            y = ys[-1] + len(x)
            ys.append(y)
        return ys

    def get_parse_trees(self):
        result = [x["parse"] for x in self._get("sentences", "sentence")]
        return result

    def get_tokens(self):
        return [x["word"] for x in self._raw_tokens]

    def get_lemmas(self):
        return [x["lemma"] for x in self._raw_tokens]

    def get_token_offsets(self):
        return [int(x["CharacterOffsetBegin"]) for x in self._raw_tokens]

    def get_pos(self):
        return [x["POS"] for x in self._raw_tokens]
    def get_found_entities(self, entity_key_prefix, gazette_manager=None):
        """
        Generates FoundEntity objects for the entities found.

        Entities that came from a gazette are keyed by their alias, so
        occurrences of the same gazette entry end up joined together.
        """
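        # Key format, illustrated (example values are hypothetical): a PERSON
        # found by plain NER at tokens [4, 6) in document "doc-1" gets the key
        # "doc-1 PERSON 4 6", while a gazette match uses just its alias
        # (e.g. "Monty Python") as the key.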
        found_entities = []
        tokens = self.get_tokens()
        for i, j, kind in self.get_entity_occurrences():
            alias = " ".join(tokens[i:j])

            if gazette_manager is not None:
                from_gazette = gazette_manager.was_entry_created_by_gazette(alias, kind)
            else:
                from_gazette = False

            if from_gazette:
                kind = gazette_manager.strip_kind(kind)
                key = alias
            else:
                key = "{} {} {} {}".format(entity_key_prefix, kind, i, j)

            found_entities.append(FoundEntity(
                key=key,
                kind_name=kind,
                alias=alias,
                offset=i,
                offset_end=j,
                from_gazette=from_gazette
            ))
        return found_entities
    def get_entity_occurrences(self):
        """
        Returns a list of tuples (i, j, kind) such that `i` is the start
        offset of an entity occurrence, `j` is the end offset and `kind` is the
        entity kind of the entity.
        """
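        # Example (illustrative): if the NER tags tokens 4 and 5 of the
        # document as PERSON, the resulting tuple is (4, 6, "PERSON");
        # `i` is inclusive, `j` is exclusive, both relative to the document.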
        found_entities = []
        offset = 0
        for words in self.sentences:
            for kind, group in groupby(enumerate(words), key=lambda x: x[1]["NER"]):
                if kind == "O":
                    continue
                ix = [i for i, word in group]
                i = ix[0] + offset
                j = ix[-1] + 1 + offset
                found_entities.append((i, j, kind))
            offset += len(words)
        return found_entities
    def get_coreferences(self):
        """
        Returns a list of lists of tuples (i, j, k) such that `i` is the start
        offset of a reference, `j` is the end offset and `k` is the index of the
        head word within the reference.
        All offsets are in tokens and relative to the start of the document.
        All references within the same list refer to the same entity.
        All references in different lists refer to different entities.
        """
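        # Example (illustrative): [[(0, 2, 1), (7, 8, 7)]] means that tokens
        # [0, 2) and token 7 refer to the same entity, with head words at
        # tokens 1 and 7; the representative mention is listed first.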
        sentence_offsets = self.get_sentence_boundaries()
        coreferences = []
        for mention in self._get("coreference", "coreference"):
            occurrences = []
            representative = 0
            for r, occurrence in enumerate(_dict_path(mention, "mention")):
                if "@representative" in occurrence:
                    representative = r
                sentence = int(occurrence["sentence"]) - 1
                offset = sentence_offsets[sentence]
                i = int(occurrence["start"]) - 1 + offset
                j = int(occurrence["end"]) - 1 + offset
                k = int(occurrence["head"]) - 1 + offset
                occurrences.append((i, j, k))

            # Occurrences' representative goes in the first position
            k = representative
            occurrences[0], occurrences[k] = occurrences[k], occurrences[0]
            coreferences.append(occurrences)
        return coreferences

def issues_merging_entities(document, entities):
    # Checks if some general preconditions are met before proceeding to merge
    # some entities on a given document.
    kinds = set(e.kind for e in entities)
    if len(kinds) != 1:
        return "Cannot merge entities of different kinds {!r}".format(kinds)
    gazettes = set(e.gazette for e in entities if e.gazette)
    if len(gazettes) > 1:
        return "Cannot merge entities of different gazette items {!r}".format(gazettes)

def apply_coreferences(document, coreferences):
    """
    Makes all entity occurrences listed in `coreferences` share the same
    entity, using the coreference information to merge the occurrences'
    entities into a single one.

    `coreferences` is a list of tuples (i, j, head) where:
        - `i` is the offset in tokens where the occurrence starts.
        - `j` is the offset in tokens where the occurrence ends.
        - `head` is the index in tokens of the head of the occurrence (the
          "most important word").

    Each entity occurrence in `coreferences` may or may not already exist in
    `document`. If no occurrence exists in `document` then nothing is done.
    If at least one occurrence exists in `document` then all the other
    occurrences named in `coreferences` are created automatically.

    This function can raise CoreferenceError in case a merge is attempted on
    entities of different kinds.
    """
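    # Example (illustrative): with coreferences=[(10, 12, 11), (30, 31, 30)],
    # if an existing (non-anaphoric) occurrence covers head token 11, a new
    # occurrence of that same entity is created for token 30 and marked as
    # anaphora.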
    # For each token index, make a list of the occurrences there
    occurrences = defaultdict(list)
    for occurrence in document.entity_occurrences.all():
        for i in range(occurrence.offset, occurrence.offset_end):
            occurrences[i].append(occurrence)

    entities = []  # Existing entities referenced by the coreferences
    pickable_as_representant = []
    missing = []  # References that have no entity occurrence yet
    for i, j, head in sorted(coreferences):
        if occurrences[head]:
            for x in occurrences[head]:
                entities.append(x.entity)
                if not x.anaphora:
                    pickable_as_representant.append(x.entity)
        else:
            missing.append((i, j, head))

    if not pickable_as_representant:
        return

    issues = issues_merging_entities(document, entities)
    if issues:
        raise CoreferenceError(issues)

    from_ner = [r for r in pickable_as_representant if not r.gazette]
    if from_ner:
        canonical = from_ner[0]
    else:
        canonical = pickable_as_representant[0]

    # Each missing coreference needs to be created into an occurrence now
    for i, j, head in missing:
        if j - i >= 5:  # If the entity is a long phrase then just keep one token
            i = head
            j = head + 1
        EntityOccurrence.objects.get_or_create(
            document=document,
            entity=canonical,
            offset=i,
            offset_end=j,
            alias=" ".join(document.tokens[i:j]),
            defaults={'anaphora': True})

    # Finally, the merging 'per se', where all things are entity occurrences
    for entity in set(x for x in entities if x != canonical):
        for occurrence in EntityOccurrence.objects.filter(entity=entity,
                                                          document=document):
            occurrence.entity = canonical
            occurrence.save()
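
# Minimal usage sketch (illustrative; assumes an already-stored IEPY document
# object `doc` with its raw text set):
#
#     preprocess = StanfordPreprocess(increment_ner=True)
#     preprocess(doc)   # runs whichever preprocess steps `doc` still needs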