import itertools
import os
import os.path
import logging

from nltk.tag.stanford import StanfordNERTagger
import wget

from iepy.preprocess.ner.base import BaseNERRunner
from iepy.utils import DIRS, unzip_file

logger = logging.getLogger(__name__)

stanford_ner_name = 'stanford-ner-2014-01-04'
download_url_base = 'http://nlp.stanford.edu/software/'


class NonTokenizingNERTagger(StanfordNERTagger):

    @property
    def _cmd(self):
        # Make the Stanford tagger respect our pre-tokenized input by forcing
        # a whitespace-only tokenizer.
        old = super(NonTokenizingNERTagger, self)._cmd
        old = old + ["-tokenizerFactory",
                     "edu.stanford.nlp.process.WhitespaceTokenizer"]
        return old


class NERRunner(BaseNERRunner):
    """Wrapper to insert a generic callable sentence NER tagger into the pipeline.
    """

    def __init__(self, ner, override=False):
        super(NERRunner, self).__init__(override=override)
        self.ner = ner
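    # A minimal sketch of the expected `ner` callable (hypothetical example,
    # not part of the pipeline): it receives a list of sentences, each a list
    # of token strings, and returns one list of (token, kind) pairs per
    # sentence. A trivial tagger that marks everything as "O" would plug in as:
    #
    #     def dummy_ner(sentences):
    #         return [[(token, 'O') for token in sentence]
    #                 for sentence in sentences]
    #
    #     runner = NERRunner(dummy_ner)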

    def run_ner(self, doc):
        entities = []
        # Apply the NER algorithm, which takes a list of sentences and returns
        # a list of sentences, each being a list of NER tokens, each of which
        # is a pair (token_string, kind)
        ner_sentences = self.ner(doc.get_sentences())
        # Flatten the nested list above into a flat iterator of kinds
        ner_kinds = (k for s in ner_sentences for (_, k) in s)
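        # Illustrative shapes only (made-up values): if ner_sentences were
        #     [[('John', 'PERSON'), ('sleeps', 'O')], [('Rome', 'LOCATION')]]
        # then ner_kinds would lazily yield 'PERSON', 'O', 'LOCATION',
        # one kind per document token, in order.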
        # Build an iterator z that goes over tuples of the form:
        #     (offset, (token, kind))
        # where offset just counts up from 0
        z = itertools.chain(
            enumerate(zip(doc.tokens, ner_kinds)),
            # Add a sentinel last token to simplify the last iteration of the
            # loop below
            [(len(doc.tokens), (None, 'INVALID'))]
        )
        # Traverse z, looking for changes in the kind field. Each change of
        # kind closes a run of contiguous tokens; if the kind of that run
        # isn't "O" (which means "other"), record the occurrence.
        #
        # offset keeps the start of the current token run; last_kind, its kind.
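        # Worked example (made-up data): for tokens
        #     ['John', 'Smith', 'sleeps', 'in', 'Rome']
        # with kinds
        #     ['PERSON', 'PERSON', 'O', 'O', 'LOCATION']
        # the loop below records tokens[0:2] as a PERSON occurrence and,
        # thanks to the sentinel, tokens[4:5] as a LOCATION occurrence.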
        last_kind = 'O'
        offset = 0
        for i, (token, kind) in z:
            if kind != last_kind:
                if last_kind != 'O':
                    # Found a new entity spanning offset:i
                    name = ' '.join(doc.tokens[offset:i])
                    entities.append(
                        self.build_occurrence(name, last_kind.lower(), name,
                                              offset, i)
                    )
                # Restart the offset counter at each change of entity kind
                offset = i
                last_kind = kind
        # Just a sanity check: verify that all NER tokens were consumed
        try:
            next(ner_kinds)
            assert False, "ner_kinds should have been completely consumed"
        except StopIteration:
            # StopIteration is the expected outcome here
            pass
        return entities


class StanfordNERRunner(NERRunner):

    def __init__(self, override=False):
        ner_path = os.path.join(DIRS.user_data_dir, stanford_ner_name)
        if not os.path.exists(ner_path):
            raise LookupError("Stanford NER not found. Try running the "
                              "command download_third_party_data.py")

        ner = NonTokenizingNERTagger(
            os.path.join(ner_path, 'classifiers',
                         'english.all.3class.distsim.crf.ser.gz'),
            os.path.join(ner_path, 'stanford-ner.jar'),
            encoding='utf8')

        super(StanfordNERRunner, self).__init__(ner.tag_sents, override)
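
# A rough usage sketch (assumes the Stanford NER package is already installed
# under DIRS.user_data_dir, and that `doc` exposes get_sentences() and tokens,
# as run_ner() above expects):
#
#     runner = StanfordNERRunner()
#     occurrences = runner.run_ner(doc)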


def download():
    logger.info("Downloading Stanford NER...")
    try:
        StanfordNERRunner()
    except LookupError:
        # Package not found; let's download and install it
        if not os.path.exists(DIRS.user_data_dir):
            os.mkdir(DIRS.user_data_dir)
        os.chdir(DIRS.user_data_dir)
        package_filename = '{0}.zip'.format(stanford_ner_name)
        zip_path = os.path.join(DIRS.user_data_dir, package_filename)
        wget.download(download_url_base + package_filename)
        unzip_file(zip_path, DIRS.user_data_dir)
    else:
        logger.info("Stanford NER is already downloaded and functional.")