import itertools
import os
import os.path
import logging

from nltk.tag.stanford import StanfordNERTagger
import wget

from iepy.preprocess.ner.base import BaseNERRunner
from iepy.utils import DIRS, unzip_file

logger = logging.getLogger(__name__)

stanford_ner_name = 'stanford-ner-2014-01-04'
download_url_base = 'http://nlp.stanford.edu/software/'


class NonTokenizingNERTagger(StanfordNERTagger):

    @property
    def _cmd(self):
        # Make the Stanford tagger respect our pre-tokenized input by forcing
        # a whitespace-only tokenizer.
        old = super(NonTokenizingNERTagger, self)._cmd
        old = old + ["-tokenizerFactory",
                     "edu.stanford.nlp.process.WhitespaceTokenizer"]
        return old


class NERRunner(BaseNERRunner):
    """Wrapper to insert a generic callable sentence NER tagger into the pipeline.
    """

    def __init__(self, ner, override=False):
        super(NERRunner, self).__init__(override=override)
        self.ner = ner
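    # A minimal sketch of the expected `ner` callable (hypothetical example,
    # not part of the pipeline): it receives a list of sentences, each a list
    # of token strings, and returns one list of (token, kind) pairs per
    # sentence. A trivial tagger that marks everything as "O" would plug in as:
    #
    #     def dummy_ner(sentences):
    #         return [[(token, 'O') for token in sentence]
    #                 for sentence in sentences]
    #
    #     runner = NERRunner(dummy_ner)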

    def run_ner(self, doc):
        entities = []
        # Apply the NER algorithm, which takes a list of sentences and returns
        # a list of sentences, each being a list of NER tokens, each of which
        # is a pair (token_string, kind)
        ner_sentences = self.ner(doc.get_sentences())
        # Flatten the nested list above into a flat iterator of kinds
        ner_kinds = (k for s in ner_sentences for (_, k) in s)
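        # Illustrative shapes only (made-up values): if ner_sentences were
        #     [[('John', 'PERSON'), ('sleeps', 'O')], [('Rome', 'LOCATION')]]
        # then ner_kinds would lazily yield 'PERSON', 'O', 'LOCATION',
        # one kind per document token, in order.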
        # Build an iterator z that goes over tuples of the form:
        #     (offset, (token, kind))
        # where offset just counts up from 0
        z = itertools.chain(
            enumerate(zip(doc.tokens, ner_kinds)),
            # Add a sentinel last token to simplify the last iteration of the
            # loop below
            [(len(doc.tokens), (None, 'INVALID'))]
        )
        # Traverse z, looking for changes in the kind field. Each change of
        # kind closes a run of contiguous tokens; if the kind of that run
        # isn't "O" (which means "other"), record the occurrence.
        #
        # offset keeps the start of the current token run; last_kind, its kind.
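        # Worked example (made-up data): for tokens
        #     ['John', 'Smith', 'sleeps', 'in', 'Rome']
        # with kinds
        #     ['PERSON', 'PERSON', 'O', 'O', 'LOCATION']
        # the loop below records tokens[0:2] as a PERSON occurrence and,
        # thanks to the sentinel, tokens[4:5] as a LOCATION occurrence.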
        last_kind = 'O'
        offset = 0
        for i, (token, kind) in z:
            if kind != last_kind:
                if last_kind != 'O':
                    # Found a new entity spanning offset:i
                    name = ' '.join(doc.tokens[offset:i])
                    entities.append(
                        self.build_occurrence(name, last_kind.lower(), name,
                                              offset, i)
                    )
                # Restart the offset counter at each change of entity kind
                offset = i
                last_kind = kind
        # Just a sanity check: verify that all NER tokens were consumed
        try:
            next(ner_kinds)
            assert False, "ner_kinds should have been completely consumed"
        except StopIteration:
            # StopIteration is the expected outcome here
            pass
        return entities


class StanfordNERRunner(NERRunner):

    def __init__(self, override=False):
        ner_path = os.path.join(DIRS.user_data_dir, stanford_ner_name)
        if not os.path.exists(ner_path):
            raise LookupError("Stanford NER not found. Try running the "
                              "command download_third_party_data.py")

        ner = NonTokenizingNERTagger(
            os.path.join(ner_path, 'classifiers',
                         'english.all.3class.distsim.crf.ser.gz'),
            os.path.join(ner_path, 'stanford-ner.jar'),
            encoding='utf8')

        super(StanfordNERRunner, self).__init__(ner.tag_sents, override)
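
# A rough usage sketch (assumes the Stanford NER package is already installed
# under DIRS.user_data_dir, and that `doc` exposes get_sentences() and tokens,
# as run_ner() above expects):
#
#     runner = StanfordNERRunner()
#     occurrences = runner.run_ner(doc)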


def download():
    logger.info("Downloading Stanford NER...")
    try:
        StanfordNERRunner()
    except LookupError:
        # Package not found; let's download and install it
        if not os.path.exists(DIRS.user_data_dir):
            os.mkdir(DIRS.user_data_dir)
        os.chdir(DIRS.user_data_dir)
        package_filename = '{0}.zip'.format(stanford_ner_name)
        zip_path = os.path.join(DIRS.user_data_dir, package_filename)
        wget.download(download_url_base + package_filename)
        unzip_file(zip_path, DIRS.user_data_dir)
    else:
        logger.info("Stanford NER is already downloaded and functional.")