# factories.py
import logging
import sys
from tempfile import NamedTemporaryFile

import factory
import nltk

from iepy.data.models import (
    IEDocument, EntityOccurrence,
    TextSegment, Relation,
    EvidenceCandidate,
)
  11. def naive_tkn(text):
  12. """Makes a naive tokenization returning pairs of tokens and
  13. offsets. Note, generated offsets are just numbers, to make things easy.
  14. """
  15. return list(enumerate(text.split()))
# In general we are not interested in Factory-Boy's own debug and info
# messages, so silence everything below WARNING for its logger.
logging.getLogger("factory").setLevel(logging.WARN)

# Single alias for the base factory class, declared like this to make it
# easier to swap the underlying ORM later.
BaseFactory = factory.django.DjangoModelFactory
class IEDocumentMetadataFactory(BaseFactory):
    """Factory for ``corpus.IEDocumentMetadata`` with all-default fields."""

    class Meta:
        model = 'corpus.IEDocumentMetadata'
  24. class EntityKindFactory(BaseFactory):
  25. class Meta:
  26. model = 'corpus.EntityKind'
  27. django_get_or_create = ('name', )
  28. name = factory.Sequence(lambda n: 'kind_%i' % n)
  29. class EntityFactory(BaseFactory):
  30. class Meta:
  31. model = 'corpus.Entity'
  32. django_get_or_create = ('key', 'kind', )
  33. key = factory.Sequence(lambda n: 'id:%i' % n)
  34. kind = factory.SubFactory(EntityKindFactory)
  35. class EntityOccurrenceFactory(BaseFactory):
  36. FACTORY_FOR = EntityOccurrence
  37. entity = factory.SubFactory(EntityFactory)
  38. offset = 0
  39. offset_end = 1
  40. alias = ''
  41. class IEDocFactory(BaseFactory):
  42. FACTORY_FOR = IEDocument
  43. metadata = factory.SubFactory(IEDocumentMetadataFactory)
  44. human_identifier = factory.Sequence(lambda n: 'doc_%i' % n)
  45. text = factory.Sequence(lambda n: 'Lorem ipsum yaba daba du! %i' % n)
  46. class TextSegmentFactory(BaseFactory):
  47. FACTORY_FOR = TextSegment
  48. document = factory.SubFactory(IEDocFactory)
  49. offset = factory.Sequence(lambda n: n * 3)
  50. offset_end = factory.Sequence(lambda n: n * 3 + 1)
  51. class SentencedIEDocFactory(IEDocFactory):
  52. FACTORY_FOR = IEDocument
  53. metadata = factory.SubFactory(IEDocumentMetadataFactory)
  54. text = factory.Sequence(lambda n: 'Lorem ipsum. Yaba daba du! %i' % n)
  55. @factory.post_generation
  56. def init(self, create, extracted, **kwargs):
  57. tokens = []
  58. sentences = [0]
  59. for sent in nltk.sent_tokenize(self.text):
  60. sent_tokens = nltk.word_tokenize(sent)
  61. tokens.extend(list(enumerate(sent_tokens)))
  62. sentences.append(sentences[-1] + len(sent_tokens))
  63. self.set_tokenization_result(tokens)
  64. self.set_sentencer_result(sentences)
  65. class SyntacticParsedIEDocFactory(IEDocFactory):
  66. FACTORY_FOR = IEDocument
  67. metadata = factory.SubFactory(IEDocumentMetadataFactory)
  68. # This factory will always return
  69. # the same sentences and trees
  70. @factory.post_generation
  71. def init(self, create, extracted, **kwargs):
  72. sentences_amount = 20
  73. tokens = []
  74. sentences = [0]
  75. for sent_tokens in nltk.corpus.treebank.sents()[:sentences_amount]:
  76. tokens.extend(list(enumerate(sent_tokens)))
  77. sentences.append(sentences[-1] + len(sent_tokens))
  78. self.set_tokenization_result(tokens)
  79. self.set_sentencer_result(sentences)
  80. tree_strings = [x.pprint() for x in nltk.corpus.treebank.parsed_sents()[:sentences_amount]]
  81. self.set_syntactic_parsing_result(tree_strings)
  82. class RelationFactory(BaseFactory):
  83. FACTORY_FOR = Relation
  84. name = factory.Sequence(lambda n: 'relation:%i' % n)
  85. left_entity_kind = factory.SubFactory(EntityKindFactory)
  86. right_entity_kind = factory.SubFactory(EntityKindFactory)
  87. def NamedTemporaryFile23(*args, **kwargs):
  88. """Works exactly as a wrapper to tempfile.NamedTemporaryFile except that
  89. in python2.x, it excludes the "encoding" parameter when provided."""
  90. if sys.version_info[0] == 2: # Python 2
  91. kwargs.pop('encoding', None)
  92. return NamedTemporaryFile(*args, **kwargs)
  93. class EvidenceCandidateFactory(BaseFactory):
  94. FACTORY_FOR = EvidenceCandidate
  95. segment = factory.SubFactory(TextSegmentFactory)
  96. left_entity_occurrence = factory.SubFactory(
  97. EntityOccurrenceFactory,
  98. document=factory.SelfAttribute('..segment.document')
  99. )
  100. right_entity_occurrence = factory.SubFactory(
  101. EntityOccurrenceFactory,
  102. document=factory.SelfAttribute('..segment.document')
  103. )
  104. class EvidenceFactory(BaseFactory):
  105. """Factory for Evidence instances()
  106. In addition to the usual Factory Boy behavior, this factory also accepts a
  107. 'markup' argument. The markup is a string with the tokens of the text
  108. segment separated by entities. You can flag entities by entering them as
  109. {token token token|kind}. You can also use kind* to flag the first/right
  110. occurrence used for the fact, and kind** to flag the second/left.
  111. For example, the following is valid markup:
  112. "The physicist {Albert Einstein|Person*} was born in {Germany|location} and
  113. died in the {United States|location**} ."
  114. """
  115. FACTORY_FOR = EvidenceCandidate
  116. segment = factory.SubFactory(TextSegmentFactory)
  117. right_entity_occurrence = factory.SubFactory(
  118. EntityOccurrenceFactory,
  119. document=factory.SelfAttribute('..segment.document')
  120. )
  121. left_entity_occurrence = factory.SubFactory(
  122. EntityOccurrenceFactory,
  123. document=factory.SelfAttribute('..segment.document')
  124. )
  125. @classmethod
  126. def create(cls, **kwargs):
  127. def eo_args(tokens, eotokens, kind):
  128. txt = ' '.join(eotokens)
  129. return {
  130. 'entity__key': txt,
  131. 'entity__kind__name': kind,
  132. 'alias': txt,
  133. 'offset': len(tokens),
  134. 'offset_end': len(tokens) + len(eotokens)
  135. }
  136. args = {}
  137. markup = kwargs.pop('markup', None)
  138. if markup is not None:
  139. # will consider the document as having exactly the same than this segment
  140. tokens = []
  141. e_occurrences = []
  142. while markup:
  143. if markup.startswith("{"):
  144. closer = markup.index("}")
  145. entity = markup[1:closer]
  146. markup = markup[closer+1:].lstrip()
  147. eotokens, eokind = entity.split('|')
  148. eotokens = eotokens.split()
  149. eo_flags = eokind.count('*')
  150. eokind = eokind.strip('*')
  151. eo_args_ = eo_args(tokens, eotokens, eokind)
  152. if eo_flags == 2:
  153. args.update(
  154. {'left_entity_occurrence__%s' % k: v
  155. for k, v in eo_args_.items()}
  156. )
  157. elif eo_flags == 1:
  158. args.update(
  159. {'right_entity_occurrence__%s' % k: v
  160. for k, v in eo_args_.items()}
  161. )
  162. else:
  163. e_occurrences.append((eotokens, len(tokens), eokind))
  164. tokens += eotokens
  165. elif ' ' in markup:
  166. token, markup = markup.split(' ', 1)
  167. tokens.append(token)
  168. else:
  169. tokens.append(markup)
  170. markup = ''
  171. args["segment__document__text"] = " ".join(tokens)
  172. args["segment__document__tokens"] = tokens
  173. args["segment__offset"] = 0
  174. args["segment__offset_end"] = len(tokens)
  175. args["e_occurrences"] = e_occurrences
  176. if "syntactic_sentence" in kwargs:
  177. syntactic_sentence = kwargs.pop("syntactic_sentence")
  178. if isinstance(syntactic_sentence, str):
  179. syntactic_sentence = nltk.tree.Tree.fromstring(syntactic_sentence)
  180. args["segment__document__sentences"] = [0]
  181. args["segment__document__syntactic_sentences"] = [syntactic_sentence]
  182. args.update(kwargs)
  183. return super(EvidenceFactory, cls).create(**args)
  184. @factory.post_generation
  185. def e_occurrences(self, create, extracted, **kwargs):
  186. doc = self.segment.document
  187. for eotokens, offset, kind_name in extracted:
  188. alias = ' '.join(eotokens)
  189. EntityOccurrenceFactory(
  190. entity__kind__name=kind_name,
  191. entity__key=alias,
  192. alias=alias,
  193. document=doc,
  194. offset=offset,
  195. offset_end=offset + len(eotokens),
  196. )
  197. # Now that all were created, check which shall be included for segment
  198. self.segment.entity_occurrences = doc.entity_occurrences.filter(
  199. offset__gte=self.segment.offset,
  200. offset_end__lte=self.segment.offset_end
  201. )
  202. class GazetteItemFactory(BaseFactory):
  203. class Meta:
  204. model = 'corpus.GazetteItem'
  205. kind = factory.SubFactory(EntityKindFactory)
  206. text = factory.Sequence(lambda n: 'gazette_item_%i' % n)