stanford.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. import itertools
  2. import os
  3. import os.path
  4. import logging
  5. from nltk.tag.stanford import StanfordNERTagger
  6. import wget
  7. from iepy.preprocess.ner.base import BaseNERRunner
  8. from iepy.utils import DIRS, unzip_file
  # Module-level logger, named after this module.
  9. logger = logging.getLogger(__name__)
  # Name of the Stanford NER release. It is used below both as the directory
  # name inside DIRS.user_data_dir and as the stem of the zip file to download.
  10. stanford_ner_name = 'stanford-ner-2014-01-04'
  # Base URL; the zip file name is appended to it to build the download URL.
  11. download_url_base = 'http://nlp.stanford.edu/software/'
  12. class NonTokenizingNERTagger(StanfordNERTagger):
  13. @property
  14. def _cmd(self):
  15. old = super(NonTokenizingNERTagger, self)._cmd
  16. old = old + ["-tokenizerFactory", "edu.stanford.nlp.process.WhitespaceTokenizer"]
  17. return old
  18. class NERRunner(BaseNERRunner):
  19. """Wrapper to insert a generic callable sentence NER tagger into the pipeline.
  20. """
  21. def __init__(self, ner, override=False):
  22. super(NERRunner, self).__init__(override=override)
  23. self.ner = ner
  24. def run_ner(self, doc):
  25. entities = []
  26. # Apply the ner algorithm which takes a list of sentences and returns
  27. # a list of sentences, each being a list of NER-tokens, each of which is
  28. # a pairs (tokenstring, class)
  29. ner_sentences = self.ner(doc.get_sentences())
  30. # Flatten the nested list above into just a list of kinds
  31. ner_kinds = (k for s in ner_sentences for (_, k) in s)
  32. # We build a large iterator z that goes over tuples like the following:
  33. # (offset, (token, kind))
  34. # offset just goes incrementally from 0
  35. z = itertools.chain(
  36. enumerate(zip(doc.tokens, ner_kinds)),
  37. # Add a sentinel last token to simplify last iteration of loop below
  38. [(len(doc.tokens), (None, 'INVALID'))]
  39. )
  40. # Traverse z, looking for changes in the kind field. If there is a
  41. # change of kind, we have a new set of contiguous tokens; if the kind
  42. # of those isn't "O" (which means "other"), record the occurrence
  43. #
  44. # offset keeps the start of the current token run; last_kind keeps the kind.
  45. last_kind = 'O'
  46. offset = 0
  47. for i, (token, kind) in z:
  48. if kind != last_kind:
  49. if last_kind != 'O':
  50. # Found a new entity in offset:i
  51. name = ' '.join(doc.tokens[offset:i])
  52. entities.append(
  53. self.build_occurrence(name, last_kind.lower(), name, offset, i)
  54. )
  55. # Restart offset counter at each change of entity type
  56. offset = i
  57. last_kind = kind
  58. # Just a sanity check: verify that all NER tokens were consumed
  59. try:
  60. next(ner_kinds)
  61. assert False, "ner_kinds should have been completely consumed"
  62. except StopIteration:
  63. # Actually the stop iteration is the expected result here
  64. pass
  65. return entities
  66. class StanfordNERRunner(NERRunner):
  67. def __init__(self, override=False):
  68. ner_path = os.path.join(DIRS.user_data_dir, stanford_ner_name)
  69. if not os.path.exists(ner_path):
  70. raise LookupError("Stanford NER not found. Try running the "
  71. "command download_third_party_data.py")
  72. ner = NonTokenizingNERTagger(
  73. os.path.join(ner_path, 'classifiers', 'english.all.3class.distsim.crf.ser.gz'),
  74. os.path.join(ner_path, 'stanford-ner.jar'),
  75. encoding='utf8')
  76. super(StanfordNERRunner, self).__init__(ner.tag_sents, override)
  77. def download():
  78. logger.info("Downloading Stanford NER...")
  79. try:
  80. StanfordNERRunner()
  81. except LookupError:
  82. # Package not found, lets download and install it
  83. if not os.path.exists(DIRS.user_data_dir):
  84. os.mkdir(DIRS.user_data_dir)
  85. os.chdir(DIRS.user_data_dir)
  86. package_filename = '{0}.zip'.format(stanford_ner_name)
  87. zip_path = os.path.join(DIRS.user_data_dir, package_filename)
  88. wget.download(download_url_base + package_filename)
  89. unzip_file(zip_path, DIRS.user_data_dir)
  90. else:
  91. logger.info("Stanford NER is already downloaded and functional.")