import logging
import sys
from tempfile import NamedTemporaryFile

import factory
import nltk

from iepy.data.models import (
    IEDocument, EntityOccurrence,
    TextSegment, Relation,
    EvidenceCandidate,
)


def naive_tkn(text):
    """Naive tokenization returning (offset, token) pairs.

    Note: the generated offsets are just consecutive numbers, to keep
    things simple.
    """
    return list(enumerate(text.split()))
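
# Example of the naive offsets produced:
#   naive_tkn("Hello big world") == [(0, 'Hello'), (1, 'big'), (2, 'world')]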

# In general, we are not interested in the debug and info messages
# of factory_boy itself.
logging.getLogger("factory").setLevel(logging.WARN)

# Declared like this to "facilitate" changing the ORM.
BaseFactory = factory.django.DjangoModelFactory


class IEDocumentMetadataFactory(BaseFactory):
    class Meta:
        model = 'corpus.IEDocumentMetadata'


class EntityKindFactory(BaseFactory):
    class Meta:
        model = 'corpus.EntityKind'
        django_get_or_create = ('name',)

    name = factory.Sequence(lambda n: 'kind_%i' % n)


class EntityFactory(BaseFactory):
    class Meta:
        model = 'corpus.Entity'
        django_get_or_create = ('key', 'kind')

    key = factory.Sequence(lambda n: 'id:%i' % n)
    kind = factory.SubFactory(EntityKindFactory)


class EntityOccurrenceFactory(BaseFactory):
    class Meta:
        model = EntityOccurrence

    entity = factory.SubFactory(EntityFactory)
    offset = 0
    offset_end = 1
    alias = ''


class IEDocFactory(BaseFactory):
    class Meta:
        model = IEDocument

    metadata = factory.SubFactory(IEDocumentMetadataFactory)
    human_identifier = factory.Sequence(lambda n: 'doc_%i' % n)
    text = factory.Sequence(lambda n: 'Lorem ipsum yaba daba du! %i' % n)


class TextSegmentFactory(BaseFactory):
    class Meta:
        model = TextSegment

    document = factory.SubFactory(IEDocFactory)
    offset = factory.Sequence(lambda n: n * 3)
    offset_end = factory.Sequence(lambda n: n * 3 + 1)


class SentencedIEDocFactory(IEDocFactory):
    class Meta:
        model = IEDocument

    metadata = factory.SubFactory(IEDocumentMetadataFactory)
    text = factory.Sequence(lambda n: 'Lorem ipsum. Yaba daba du! %i' % n)

    @factory.post_generation
    def init(self, create, extracted, **kwargs):
        tokens = []
        sentences = [0]  # token index where each sentence starts
        for sent in nltk.sent_tokenize(self.text):
            sent_tokens = nltk.word_tokenize(sent)
            # Offsets continue across sentences (document-wide), matching the
            # naive offsets used elsewhere in this module.
            tokens.extend(enumerate(sent_tokens, start=sentences[-1]))
            sentences.append(sentences[-1] + len(sent_tokens))
        self.set_tokenization_result(tokens)
        self.set_sentencer_result(sentences)
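
    # Illustrative sketch (assuming NLTK's default sentence/word tokenizers):
    # for the text "Hello there. Bye." the post-generation above would yield
    #   tokens    -> [(0, 'Hello'), (1, 'there'), (2, '.'), (3, 'Bye'), (4, '.')]
    #   sentences -> [0, 3, 5]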


class SyntacticParsedIEDocFactory(IEDocFactory):
    class Meta:
        model = IEDocument

    metadata = factory.SubFactory(IEDocumentMetadataFactory)

    # This factory will always return the same sentences and trees.
    @factory.post_generation
    def init(self, create, extracted, **kwargs):
        sentences_amount = 20
        tokens = []
        sentences = [0]
        for sent_tokens in nltk.corpus.treebank.sents()[:sentences_amount]:
            tokens.extend(enumerate(sent_tokens, start=sentences[-1]))
            sentences.append(sentences[-1] + len(sent_tokens))
        self.set_tokenization_result(tokens)
        self.set_sentencer_result(sentences)
        tree_strings = [
            tree.pprint()
            for tree in nltk.corpus.treebank.parsed_sents()[:sentences_amount]
        ]
        self.set_syntactic_parsing_result(tree_strings)
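
    # Note: this factory reads from nltk.corpus.treebank, so the NLTK
    # "treebank" corpus must be available locally (e.g. nltk.download('treebank')).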


class RelationFactory(BaseFactory):
    class Meta:
        model = Relation

    name = factory.Sequence(lambda n: 'relation:%i' % n)
    left_entity_kind = factory.SubFactory(EntityKindFactory)
    right_entity_kind = factory.SubFactory(EntityKindFactory)


def NamedTemporaryFile23(*args, **kwargs):
    """Thin wrapper around tempfile.NamedTemporaryFile that, on Python 2.x,
    drops the "encoding" keyword argument (which is not supported there).
    """
    if sys.version_info[0] == 2:  # Python 2
        kwargs.pop('encoding', None)
    return NamedTemporaryFile(*args, **kwargs)
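
# Usage sketch: the same call works on Python 2 and 3; on Python 2 the
# "encoding" argument is simply discarded.
#   with NamedTemporaryFile23(mode='w', encoding='utf-8', suffix='.txt') as tmp:
#       tmp.write(u'some text')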


class EvidenceCandidateFactory(BaseFactory):
    class Meta:
        model = EvidenceCandidate

    segment = factory.SubFactory(TextSegmentFactory)
    left_entity_occurrence = factory.SubFactory(
        EntityOccurrenceFactory,
        document=factory.SelfAttribute('..segment.document')
    )
    right_entity_occurrence = factory.SubFactory(
        EntityOccurrenceFactory,
        document=factory.SelfAttribute('..segment.document')
    )
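
# Note: via SelfAttribute('..segment.document'), both occurrences above are
# created on the very same document as the candidate's segment, so a plain
# EvidenceCandidateFactory() yields a segment and two occurrences that all
# share one IEDocument.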


class EvidenceFactory(BaseFactory):
    """Factory for EvidenceCandidate instances.

    In addition to the usual factory_boy behavior, this factory also accepts a
    'markup' argument. The markup is a whitespace-separated string of the text
    segment's tokens, in which entity occurrences are marked inline as
    {token token token|kind}. Append * to the kind to flag the right entity
    occurrence of the evidence, and ** to flag the left one.

    For example, the following is valid markup:
    "The physicist {Albert Einstein|Person*} was born in {Germany|location} and
    died in the {United States|location**} ."
    """
    class Meta:
        model = EvidenceCandidate

    segment = factory.SubFactory(TextSegmentFactory)
    right_entity_occurrence = factory.SubFactory(
        EntityOccurrenceFactory,
        document=factory.SelfAttribute('..segment.document')
    )
    left_entity_occurrence = factory.SubFactory(
        EntityOccurrenceFactory,
        document=factory.SelfAttribute('..segment.document')
    )

    @classmethod
    def create(cls, **kwargs):
        def eo_args(tokens, eotokens, kind):
            txt = ' '.join(eotokens)
            return {
                'entity__key': txt,
                'entity__kind__name': kind,
                'alias': txt,
                'offset': len(tokens),
                'offset_end': len(tokens) + len(eotokens)
            }

        args = {}
        markup = kwargs.pop('markup', None)
        if markup is not None:
            # The document will be considered to have exactly the same text
            # as this segment.
            tokens = []
            e_occurrences = []
            while markup:
                if markup.startswith("{"):
                    closer = markup.index("}")
                    entity = markup[1:closer]
                    markup = markup[closer + 1:].lstrip()
                    eotokens, eokind = entity.split('|')
                    eotokens = eotokens.split()
                    eo_flags = eokind.count('*')
                    eokind = eokind.strip('*')
                    eo_args_ = eo_args(tokens, eotokens, eokind)
                    if eo_flags == 2:
                        args.update(
                            {'left_entity_occurrence__%s' % k: v
                             for k, v in eo_args_.items()}
                        )
                    elif eo_flags == 1:
                        args.update(
                            {'right_entity_occurrence__%s' % k: v
                             for k, v in eo_args_.items()}
                        )
                    else:
                        e_occurrences.append((eotokens, len(tokens), eokind))
                    tokens += eotokens
                elif ' ' in markup:
                    token, markup = markup.split(' ', 1)
                    tokens.append(token)
                else:
                    tokens.append(markup)
                    markup = ''
            args["segment__document__text"] = " ".join(tokens)
            args["segment__document__tokens"] = tokens
            args["segment__offset"] = 0
            args["segment__offset_end"] = len(tokens)
            args["e_occurrences"] = e_occurrences
        if "syntactic_sentence" in kwargs:
            syntactic_sentence = kwargs.pop("syntactic_sentence")
            if isinstance(syntactic_sentence, str):
                syntactic_sentence = nltk.tree.Tree.fromstring(syntactic_sentence)
            args["segment__document__sentences"] = [0]
            args["segment__document__syntactic_sentences"] = [syntactic_sentence]
        args.update(kwargs)
        return super(EvidenceFactory, cls).create(**args)

    @factory.post_generation
    def e_occurrences(self, create, extracted, **kwargs):
        if not extracted:
            # Nothing was passed (e.g. no 'markup' was used): nothing to create.
            return
        doc = self.segment.document
        for eotokens, offset, kind_name in extracted:
            alias = ' '.join(eotokens)
            EntityOccurrenceFactory(
                entity__kind__name=kind_name,
                entity__key=alias,
                alias=alias,
                document=doc,
                offset=offset,
                offset_end=offset + len(eotokens),
            )
        # Now that all of them were created, pick which ones fall inside the segment.
        self.segment.entity_occurrences = doc.entity_occurrences.filter(
            offset__gte=self.segment.offset,
            offset_end__lte=self.segment.offset_end
        )
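
# Usage sketch for the markup syntax documented in EvidenceFactory (kept as a
# comment so the module stays import-safe; '*' marks the right occurrence and
# '**' the left one, per EvidenceFactory.create above):
#
#   evidence = EvidenceFactory(
#       markup="The physicist {Albert Einstein|Person*} was born in "
#              "{Germany|location} and died in the {United States|location**} ."
#   )
#
# Here "Albert Einstein" becomes the right entity occurrence, "United States"
# the left one, and "Germany" is created as an extra occurrence on the same
# document (via the e_occurrences post-generation hook).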


class GazetteItemFactory(BaseFactory):
    class Meta:
        model = 'corpus.GazetteItem'

    kind = factory.SubFactory(EntityKindFactory)
    text = factory.Sequence(lambda n: 'gazette_item_%i' % n)