# -*- coding: utf-8 -*-
import re

import nltk.data
from nltk.tokenize import RegexpTokenizer

from iepy.preprocess.pipeline import BasePreProcessStepRunner, PreProcessSteps
from iepy.utils import DIRS


class TokenizeSentencerRunner(BasePreProcessStepRunner):
    """Does tokenization and sentence segmentation together over IEDocuments.

    - If override=True, both steps are run and their results stored on the
      document, no matter whether they were already computed or not.
    - If override=False and only one of the two steps was done, it behaves
      exactly as with override=True.
    - If override=False and both steps were already done on the document,
      it does nothing.
    """
    step = PreProcessSteps.tokenization

    def __init__(self, override=False, increment=False, lang='en'):
        if lang != 'en':
            # Only English tokenization and segmentation are provided right
            # now. If you need support for another language, this is a good
            # place to add it.
            raise NotImplementedError
        self.lang = lang
        self.override = override
        self.increment = increment
        # We'll be doing 2 preprocess steps in one here.
        self.tkn_step = PreProcessSteps.tokenization
        self.snt_step = PreProcessSteps.sentencer

    def __call__(self, doc):
        tkn_done = doc.was_preprocess_step_done(self.tkn_step)
        snt_done = doc.was_preprocess_step_done(self.snt_step)
        if self.override or not (tkn_done and snt_done):
            # Ok, let's do it
            result = en_tokenize_and_segment(doc.text)
            doc.set_tokenization_result(
                list(zip(result['spans'], result['tokens'])))
            doc.set_sentencer_result(result['sentences'])
            doc.save()
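
# A rough usage sketch (not part of the original module; `doc` here stands for
# any IEDocument-like object exposing `text`, `was_preprocess_step_done`, the
# two result setters and `save`):
#
#     runner = TokenizeSentencerRunner(override=True)
#     runner(doc)  # tokenizes and segments doc.text, then stores both results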


def en_tokenize_and_segment(text):
    """
    Tokenizes and segments a string `text` interpreted as English text.

    Returns a dict with keys 'tokens', 'spans' and 'sentences', where:
        - tokens is a list of strings corresponding to the tokens in `text`.
        - spans is a list of the starting offsets in `text` for each token
          in `tokens`.
        - sentences is a list of indexes that represent the start and end
          position of each sentence, like this: the i-th sentence starts on
          token `sentences[i]` and ends on token `sentences[i + 1]`.
          There are `len(sentences) - 1` sentences represented in the list.
    """
    tokenizer = _get_tokenizer()

    tokens = []
    spans = []
    sentences = [0]
    for sentence_i, sentence_j, sentence in _split_in_sentences(text):
        if sentence_i == sentence_j:
            # Empty sentence: nothing to tokenize.
            continue
        for i, j in tokenizer.span_tokenize(sentence):
            # Token offsets are relative to the sentence; shift them so they
            # become absolute offsets into `text`.
            spans.append(sentence_i + i)
            tokens.append(sentence[i:j])
        sentences.append(len(tokens))
    return {'tokens': tokens,
            'spans': spans,
            'sentences': sentences}
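
# For reference (not in the original): each result['spans'][k] is the offset
# of result['tokens'][k] inside `text`, and the tokens of sentence i are the
# slice tokens[sentences[i]:sentences[i + 1]].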


def _split_in_sentences(text):
    # Make sure NLTK also looks for its data (the punkt model) inside IEPY's
    # user data dir.
    if not nltk.data.path or nltk.data.path[-1] != DIRS.user_data_dir:
        nltk.data.path.append(DIRS.user_data_dir)
    sentence_splitter = nltk.data.load("tokenizers/punkt/english.pickle")
    for i, j in sentence_splitter.span_tokenize(text):
        yield i, j, text[i:j]
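
# Each yielded item is a (start, end, sentence_text) triple. For example,
# _split_in_sentences("One. Two.") should yield roughly
#     (0, 4, "One.") and (5, 9, "Two.")
# (exact offsets depend on the punkt model; this is only an illustration).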


###
### English tokenizer using regular expressions
###

basic_macros = {
    "AN1": "[a-z0-9]",
    "AN2": "[a-z0-9\\._]",
    "AN3": r"[a-z0-9-_\.~!*'();:@&=+$,/?%#\[\]]",
}
macros = {
    "USERNAME": "{AN1}{AN2}*",
    "HOSTNAME": "{AN1}{AN2}*",
    "HOSTNAME2": r"{AN1}{AN2}*\.{AN2}*",
    "HOSTNAME3": r"{AN1}{AN2}*(:[0-9]{{1,5}})?",
    "HOSTNAME4": r"www\.{AN1}{AN2}*\.{AN2}*(:[0-9]{{1,5}})?",
    "SCHEME": "mailto:|((http|https|ftp|ftps|ssh|git|news)://)",
}
macros = {k: "(" + v.format(**basic_macros) + ")"
          for k, v in macros.items()}
macros.update(basic_macros)
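
# For reference (derived by hand from the definitions above): after this
# expansion, macros["USERNAME"] is the string "([a-z0-9][a-z0-9\._]*)", so a
# template such as "{USERNAME}@{HOSTNAME2}" below turns into a fully
# spelled-out pattern once formatted with `macros`.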

# Smiley detection
eyes = ":;8xX>="
noses = [""] + list("-o")
mouths = list("DP/") + ["}}", "{{", "\\[", "\\]", "\\(", "\\)", "\\|"]
smileys = [x + y + z for x in eyes for y in noses for z in mouths]
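# This builds every eyes+nose+mouth combination as a small regex fragment,
# e.g. ":D", ":-D", ";P" and "8\\)" (which matches the literal text "8)").
# Brackets, parens and pipes are backslash-escaped because they are regex
# metacharacters; the braces are doubled so they survive the later
# str.format() call.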

HEADER = [
    "([01]?[0-9]|2[0-4]):[0-5]?[0-9](:[0-5]?[0-9])?",  # Time of day
    "''|``",                                           # Quotation
    "{USERNAME}@{HOSTNAME2}",                          # Typical email
    "{SCHEME}({USERNAME}@)?{HOSTNAME3}(/{AN3}*)?",     # URI
    "{HOSTNAME4}",                                     # Typical URL
]

FOOTER = [
    "\w+&\w+",                            # Ampersand-joined words (e.g. AT&T)
    "\w+",                                # Normal words
    "|".join(smileys),                    # Smileys
    "[()/\[\]\\.,;:\-\"'`~?]|\\.\\.\\.",  # Punctuation marks
    "\S+",                                # Anything else (catch-all)
]

english_contractions = [
    "ain't",
    "aren't",
    "can't",
    "can't've",
    "'cause",
    "could've",
    "couldn't",
    "couldn't've",
    "didn't",
    "doesn't",
    "don't",
    "hadn't",
    "hadn't've",
    "hasn't",
    "haven't",
    "he'd",
    "he'd've",
    "he'll",
    "he'll've",
    "he's",
    "how'd",
    "how'd'y",
    "how'll",
    "how's",
    "I'd",
    "I'd've",
    "I'll",
    "I'll've",
    "I'm",
    "I've",
    "isn't",
    "it'd",
    "it'd've",
    "it'll",
    "it'll've",
    "it's",
    "let's",
    "ma'am",
    "might've",
    "mightn't",
    "mightn't've",
    "must've",
    "mustn't",
    "mustn't've",
    "needn't",
    "o'clock",
    "oughtn't",
    "oughtn't've",
    "shan't",
    "shan't've",
    "she'd",
    "she'd've",
    "she'll",
    "she'll've",
    "she's",
    "should've",
    "shouldn't",
    "shouldn't've",
    "so's",
    "that's",
    "there'd",
    "there's",
    "they'd",
    "they'll",
    "they'll've",
    "they're",
    "they've",
    "to've",
    "wasn't",
    "we'd",
    "we'll",
    "we'll've",
    "we're",
    "we've",
    "weren't",
    "what'll",
    "what'll've",
    "what're",
    "what's",
    "what've",
    "when's",
    "when've",
    "where'd",
    "where's",
    "where've",
    "who'll",
    "who'll've",
    "who's",
    "who've",
    "why's",
    "will've",
    "won't",
    "won't've",
    "would've",
    "wouldn't",
    "wouldn't've",
    "y'all",
    "y'all'd've",
    "y'all're",
    "y'all've",
    "you'd",
    "you'd've",
    "you'll",
    "you'll've",
    "you're",
    "you've",
]

en_regex = HEADER + [
    "[01]?[0-9][-/.][0123]?[0-9][-/.][0-9]{{2,4}}",  # Date mm/dd/yyyy
    "|".join(english_contractions),                  # Common contractions
    "'s",                                            # Possessive
    "\w+([_-]\w+)+",                                 # Normal words+compounds
] + FOOTER
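
# Order matters inside the final alternation: at each position the first
# matching alternative wins, so dates, contractions and compounds are tried
# before the generic "\w+", and "\S+" acts as a last-resort catch-all.
# A rough check (hypothetical snippet, not part of the module):
#
#     >>> _get_tokenizer().tokenize("I don't like e-mail at 10:30")
#     # "don't", "e-mail" and "10:30" should each come out as single tokens.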


def _get_tokenizer(__cache=[]):
    """
    Get a tokenizer for English.
    """
    # The mutable default argument is used on purpose as a memoization cache,
    # so the combined regex is only built and compiled once.
    if not __cache:
        regex = [x.format(**macros) for x in en_regex]
        regex = u"|".join(regex)
        tokenizer = RegexpTokenizer(regex, flags=re.UNICODE |
                                                 re.MULTILINE |
                                                 re.DOTALL | re.I)
        __cache.append(tokenizer)
    return __cache[0]
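

if __name__ == '__main__':
    # Minimal smoke-test sketch, not part of the original module. It assumes
    # the NLTK punkt model is available (either on the default nltk.data path
    # or under DIRS.user_data_dir).
    sample = "IEPY splits text into tokens. It also finds sentence bounds."
    result = en_tokenize_and_segment(sample)
    # Print the tokens of each sentence, using the sentence boundary indexes.
    for k in range(len(result['sentences']) - 1):
        start, end = result['sentences'][k], result['sentences'][k + 1]
        print(result['tokens'][start:end])
    # Every span should point back at its token inside the original string.
    for offset, token in zip(result['spans'], result['tokens']):
        assert sample[offset:offset + len(token)] == token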