import logging
import sys
from tempfile import NamedTemporaryFile

import factory
import nltk

from iepy.data.models import (
    IEDocument, EntityOccurrence,
    TextSegment, Relation,
    EvidenceCandidate,
)


def naive_tkn(text):
    """Naive tokenization returning (offset, token) pairs.

    Note: the generated offsets are just consecutive numbers, to keep
    things simple.
    """
    return list(enumerate(text.split()))
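
# Example of the naive offsets produced:
#   naive_tkn("Hello big world") == [(0, 'Hello'), (1, 'big'), (2, 'world')]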

# In general, we are not interested in the debug and info messages
# of factory_boy itself.
logging.getLogger("factory").setLevel(logging.WARN)

# Declared like this to "facilitate" changing the ORM.
BaseFactory = factory.django.DjangoModelFactory


class IEDocumentMetadataFactory(BaseFactory):
    class Meta:
        model = 'corpus.IEDocumentMetadata'


class EntityKindFactory(BaseFactory):
    class Meta:
        model = 'corpus.EntityKind'
        django_get_or_create = ('name',)

    name = factory.Sequence(lambda n: 'kind_%i' % n)


class EntityFactory(BaseFactory):
    class Meta:
        model = 'corpus.Entity'
        django_get_or_create = ('key', 'kind')

    key = factory.Sequence(lambda n: 'id:%i' % n)
    kind = factory.SubFactory(EntityKindFactory)


class EntityOccurrenceFactory(BaseFactory):
    class Meta:
        model = EntityOccurrence

    entity = factory.SubFactory(EntityFactory)
    offset = 0
    offset_end = 1
    alias = ''


class IEDocFactory(BaseFactory):
    class Meta:
        model = IEDocument

    metadata = factory.SubFactory(IEDocumentMetadataFactory)
    human_identifier = factory.Sequence(lambda n: 'doc_%i' % n)
    text = factory.Sequence(lambda n: 'Lorem ipsum yaba daba du! %i' % n)


class TextSegmentFactory(BaseFactory):
    class Meta:
        model = TextSegment

    document = factory.SubFactory(IEDocFactory)
    offset = factory.Sequence(lambda n: n * 3)
    offset_end = factory.Sequence(lambda n: n * 3 + 1)


class SentencedIEDocFactory(IEDocFactory):
    class Meta:
        model = IEDocument

    metadata = factory.SubFactory(IEDocumentMetadataFactory)
    text = factory.Sequence(lambda n: 'Lorem ipsum. Yaba daba du! %i' % n)

    @factory.post_generation
    def init(self, create, extracted, **kwargs):
        tokens = []
        sentences = [0]  # token index where each sentence starts
        for sent in nltk.sent_tokenize(self.text):
            sent_tokens = nltk.word_tokenize(sent)
            # Offsets continue across sentences (document-wide), matching the
            # naive offsets used elsewhere in this module.
            tokens.extend(enumerate(sent_tokens, start=sentences[-1]))
            sentences.append(sentences[-1] + len(sent_tokens))
        self.set_tokenization_result(tokens)
        self.set_sentencer_result(sentences)
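
    # Illustrative sketch (assuming NLTK's default sentence/word tokenizers):
    # for the text "Hello there. Bye." the post-generation above would yield
    #   tokens    -> [(0, 'Hello'), (1, 'there'), (2, '.'), (3, 'Bye'), (4, '.')]
    #   sentences -> [0, 3, 5]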


class SyntacticParsedIEDocFactory(IEDocFactory):
    class Meta:
        model = IEDocument

    metadata = factory.SubFactory(IEDocumentMetadataFactory)

    # This factory will always return the same sentences and trees.
    @factory.post_generation
    def init(self, create, extracted, **kwargs):
        sentences_amount = 20
        tokens = []
        sentences = [0]
        for sent_tokens in nltk.corpus.treebank.sents()[:sentences_amount]:
            tokens.extend(enumerate(sent_tokens, start=sentences[-1]))
            sentences.append(sentences[-1] + len(sent_tokens))
        self.set_tokenization_result(tokens)
        self.set_sentencer_result(sentences)
        tree_strings = [
            tree.pprint()
            for tree in nltk.corpus.treebank.parsed_sents()[:sentences_amount]
        ]
        self.set_syntactic_parsing_result(tree_strings)
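
    # Note: this factory reads from nltk.corpus.treebank, so the NLTK
    # "treebank" corpus must be available locally (e.g. nltk.download('treebank')).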


class RelationFactory(BaseFactory):
    class Meta:
        model = Relation

    name = factory.Sequence(lambda n: 'relation:%i' % n)
    left_entity_kind = factory.SubFactory(EntityKindFactory)
    right_entity_kind = factory.SubFactory(EntityKindFactory)


def NamedTemporaryFile23(*args, **kwargs):
    """Thin wrapper around tempfile.NamedTemporaryFile that, on Python 2.x,
    drops the "encoding" keyword argument (which is not supported there).
    """
    if sys.version_info[0] == 2:  # Python 2
        kwargs.pop('encoding', None)
    return NamedTemporaryFile(*args, **kwargs)
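
# Usage sketch: the same call works on Python 2 and 3; on Python 2 the
# "encoding" argument is simply discarded.
#   with NamedTemporaryFile23(mode='w', encoding='utf-8', suffix='.txt') as tmp:
#       tmp.write(u'some text')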


class EvidenceCandidateFactory(BaseFactory):
    class Meta:
        model = EvidenceCandidate

    segment = factory.SubFactory(TextSegmentFactory)
    left_entity_occurrence = factory.SubFactory(
        EntityOccurrenceFactory,
        document=factory.SelfAttribute('..segment.document')
    )
    right_entity_occurrence = factory.SubFactory(
        EntityOccurrenceFactory,
        document=factory.SelfAttribute('..segment.document')
    )
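
# Note: via SelfAttribute('..segment.document'), both occurrences above are
# created on the very same document as the candidate's segment, so a plain
# EvidenceCandidateFactory() yields a segment and two occurrences that all
# share one IEDocument.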


class EvidenceFactory(BaseFactory):
    """Factory for EvidenceCandidate instances.

    In addition to the usual factory_boy behavior, this factory also accepts a
    'markup' argument. The markup is a whitespace-separated string of the text
    segment's tokens, in which entity occurrences are marked inline as
    {token token token|kind}. Append * to the kind to flag the right entity
    occurrence of the evidence, and ** to flag the left one.

    For example, the following is valid markup:
    "The physicist {Albert Einstein|Person*} was born in {Germany|location} and
    died in the {United States|location**} ."
    """
    class Meta:
        model = EvidenceCandidate

    segment = factory.SubFactory(TextSegmentFactory)
    right_entity_occurrence = factory.SubFactory(
        EntityOccurrenceFactory,
        document=factory.SelfAttribute('..segment.document')
    )
    left_entity_occurrence = factory.SubFactory(
        EntityOccurrenceFactory,
        document=factory.SelfAttribute('..segment.document')
    )

    @classmethod
    def create(cls, **kwargs):
        def eo_args(tokens, eotokens, kind):
            txt = ' '.join(eotokens)
            return {
                'entity__key': txt,
                'entity__kind__name': kind,
                'alias': txt,
                'offset': len(tokens),
                'offset_end': len(tokens) + len(eotokens)
            }

        args = {}
        markup = kwargs.pop('markup', None)
        if markup is not None:
            # The document will be considered to have exactly the same text
            # as this segment.
            tokens = []
            e_occurrences = []
            while markup:
                if markup.startswith("{"):
                    closer = markup.index("}")
                    entity = markup[1:closer]
                    markup = markup[closer + 1:].lstrip()
                    eotokens, eokind = entity.split('|')
                    eotokens = eotokens.split()
                    eo_flags = eokind.count('*')
                    eokind = eokind.strip('*')
                    eo_args_ = eo_args(tokens, eotokens, eokind)
                    if eo_flags == 2:
                        args.update(
                            {'left_entity_occurrence__%s' % k: v
                             for k, v in eo_args_.items()}
                        )
                    elif eo_flags == 1:
                        args.update(
                            {'right_entity_occurrence__%s' % k: v
                             for k, v in eo_args_.items()}
                        )
                    else:
                        e_occurrences.append((eotokens, len(tokens), eokind))
                    tokens += eotokens
                elif ' ' in markup:
                    token, markup = markup.split(' ', 1)
                    tokens.append(token)
                else:
                    tokens.append(markup)
                    markup = ''
            args["segment__document__text"] = " ".join(tokens)
            args["segment__document__tokens"] = tokens
            args["segment__offset"] = 0
            args["segment__offset_end"] = len(tokens)
            args["e_occurrences"] = e_occurrences
        if "syntactic_sentence" in kwargs:
            syntactic_sentence = kwargs.pop("syntactic_sentence")
            if isinstance(syntactic_sentence, str):
                syntactic_sentence = nltk.tree.Tree.fromstring(syntactic_sentence)
            args["segment__document__sentences"] = [0]
            args["segment__document__syntactic_sentences"] = [syntactic_sentence]
        args.update(kwargs)
        return super(EvidenceFactory, cls).create(**args)

    @factory.post_generation
    def e_occurrences(self, create, extracted, **kwargs):
        if not extracted:
            # Nothing was passed (e.g. no 'markup' was used): nothing to create.
            return
        doc = self.segment.document
        for eotokens, offset, kind_name in extracted:
            alias = ' '.join(eotokens)
            EntityOccurrenceFactory(
                entity__kind__name=kind_name,
                entity__key=alias,
                alias=alias,
                document=doc,
                offset=offset,
                offset_end=offset + len(eotokens),
            )
        # Now that all of them were created, pick which ones fall inside the segment.
        self.segment.entity_occurrences = doc.entity_occurrences.filter(
            offset__gte=self.segment.offset,
            offset_end__lte=self.segment.offset_end
        )
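
# Usage sketch for the markup syntax documented in EvidenceFactory (kept as a
# comment so the module stays import-safe; '*' marks the right occurrence and
# '**' the left one, per EvidenceFactory.create above):
#
#   evidence = EvidenceFactory(
#       markup="The physicist {Albert Einstein|Person*} was born in "
#              "{Germany|location} and died in the {United States|location**} ."
#   )
#
# Here "Albert Einstein" becomes the right entity occurrence, "United States"
# the left one, and "Germany" is created as an extra occurrence on the same
# document (via the e_occurrences post-generation hook).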


class GazetteItemFactory(BaseFactory):
    class Meta:
        model = 'corpus.GazetteItem'

    kind = factory.SubFactory(EntityKindFactory)
    text = factory.Sequence(lambda n: 'gazette_item_%i' % n)