test_db_preprocess_administration.py

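"""Tests for document creation through ``DocumentManager`` and for the
preprocess-step bookkeeping (result setters, done-flags, and filters) on
``IEDocument``."""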
from unittest import TestCase
from unittest import mock

from iepy.data.db import DocumentManager
from iepy.data.models import IEDocument
from iepy.preprocess.pipeline import PreProcessSteps
from iepy.preprocess.ner.base import FoundEntity
from iepy.preprocess.segmenter import RawSegment

from .factories import IEDocFactory, SentencedIEDocFactory, naive_tkn
from .manager_case import ManagerTestCase


class TestDocumentCreationThruManager(ManagerTestCase):
    sample_id = 'sample-id'
    sample_text = 'this is a sample text'
    sample_metadata = {'iepy': 'rocks'}
    docmanager = DocumentManager()

    def test_create_basic(self):
        doc = self.docmanager.create_document(self.sample_id, self.sample_text,
                                              self.sample_metadata)
        self.assertEqual(doc.human_identifier, self.sample_id)
        self.assertEqual(doc.text, self.sample_text)
        self.assertEqual(doc.metadata.items, self.sample_metadata)
        self.assertEqual(IEDocument.objects.count(), 1)

    def test_create_existent_does_nothing(self):
        doc = self.docmanager.create_document(self.sample_id, self.sample_text,
                                              self.sample_metadata)
        doc2 = self.docmanager.create_document(self.sample_id, self.sample_text,
                                               self.sample_metadata)
        self.assertEqual(doc, doc2)
        self.assertEqual(IEDocument.objects.count(), 1)

    def test_doc_text_and_metadata_are_updated_if_enabled(self):
        new_text = self.sample_text + ' but longer'
        new_metadata = {'something': 'different'}
        self.docmanager.create_document(self.sample_id, self.sample_text,
                                        self.sample_metadata)
        # Without update_mode, the existing document is returned unchanged.
        doc = self.docmanager.create_document(self.sample_id, new_text,
                                              new_metadata)
        self.assertNotEqual(doc.text, new_text)
        self.assertEqual(doc.text, self.sample_text)
        self.assertNotEqual(doc.metadata.items, new_metadata)
        self.assertEqual(doc.metadata.items, self.sample_metadata)
        # With update_mode=True, text and metadata are overwritten.
        doc = self.docmanager.create_document(self.sample_id, new_text,
                                              new_metadata, update_mode=True)
        self.assertEqual(doc.text, new_text)
        self.assertEqual(doc.metadata.items, new_metadata)
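

# A minimal sketch (not part of the original suite) of the create-or-update
# contract exercised above; the identifier and payloads are invented.
def _create_or_update_sketch():
    manager = DocumentManager()
    # First call creates the document.
    manager.create_document('doc-1', 'first text', {'k': 'v'})
    # Same id without update_mode: stored text/metadata are left untouched.
    manager.create_document('doc-1', 'other text', {'k2': 'v2'})
    # With update_mode=True, text and metadata are overwritten.
    return manager.create_document('doc-1', 'other text', {'k2': 'v2'},
                                   update_mode=True)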


class TestDocumentsPreprocessMetadata(ManagerTestCase):

    def test_preprocess_steps(self):
        self.assertEqual(
            [p.name for p in PreProcessSteps],
            ['tokenization', 'lemmatization', 'sentencer', 'tagging', 'ner',
             'segmentation', 'syntactic_parsing'])

    def test_just_created_document_has_no_preprocess_done(self):
        doc = IEDocFactory()
        for step in PreProcessSteps:
            self.assertFalse(doc.was_preprocess_step_done(step))

    def test_cannot_set_sentencer_if_not_tokenization_stored(self):
        doc = IEDocFactory(text='Some sentence.')
        sentences = [0, 3]
        self.assertRaises(ValueError, doc.set_sentencer_result, sentences)
        self.assertFalse(doc.was_preprocess_step_done(PreProcessSteps.sentencer))

    def test_cannot_set_sentences_larger_than_tokens(self):
        # sentencer values must be valid indexes into the tokens list
        doc = IEDocFactory(text='Some sentence.')
        doc.set_tokenization_result(naive_tkn(doc.text))
        sentences = [35]
        self.assertRaises(ValueError, doc.set_sentencer_result, sentences)
        self.assertFalse(doc.was_preprocess_step_done(PreProcessSteps.sentencer))

    def test_sentencer_result_must_be_ordered_list_of_numbers(self):
        doc = IEDocFactory(text='Some sentence . And some other . Indeed !')
        sentences = [7, 3, 0]
        self.assertRaises(ValueError, doc.set_sentencer_result, sentences)
        # boundaries must also be strictly ascending, with no duplicates
        sentences = [0, 0, 3]
        self.assertRaises(ValueError, doc.set_sentencer_result, sentences)
        self.assertFalse(doc.was_preprocess_step_done(PreProcessSteps.sentencer))

    def test_cannot_set_tagging_result_of_different_cardinality_than_tokens(self):
        doc = IEDocFactory(text='Some sentence')
        doc.set_tokenization_result(naive_tkn(doc.text))
        step = PreProcessSteps.tagging
        for tags in [['NN'], ['NN', 'POS', 'VB']]:
            self.assertRaises(ValueError, doc.set_tagging_result, tags)
            self.assertFalse(doc.was_preprocess_step_done(step))

    def test_setting_tagging_result_can_be_later_retrieved(self):
        doc = IEDocFactory(text='Some sentence. And some other. Indeed !')
        tokens = naive_tkn(doc.text)
        doc.set_tokenization_result(tokens)
        simple_tags = ['NN' for _ in tokens]
        doc.set_tagging_result(simple_tags)
        self.assertTrue(doc.was_preprocess_step_done(PreProcessSteps.tagging))
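

# A sketch of the ordering/validation contract exercised above (values are
# illustrative): tokenization must be stored before sentencer boundaries,
# which must be a strictly ascending list of valid token indexes.
def _sentencer_validation_sketch():
    doc = IEDocFactory(text='Some sentence .')
    doc.set_tokenization_result(naive_tkn(doc.text))
    # Valid for a 3-token document; [3, 0] or [0, 0, 3] would raise ValueError.
    doc.set_sentencer_result([0, 3])
    return doc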


class TestStorePreprocessOutputSideEffects(ManagerTestCase):

    def doc_ready_for(self, desired_step, save=True):
        """Creates and returns a new document with every step previous to
        `desired_step` already applied (and saved, unless explicitly disabled),
        together with a valid result for the desired step.
        """
        text = 'Hello world.'
        doc = IEDocFactory(text=text)
        sample_values = [
            (PreProcessSteps.tokenization, [(0, 'Hello'), (6, 'world'), (11, '.')]),
            (PreProcessSteps.lemmatization, ['hello', 'world', '.']),
            (PreProcessSteps.sentencer, [0, 3]),
            (PreProcessSteps.tagging, ['NN', 'NN', '.']),
            (PreProcessSteps.ner,
             [FoundEntity('world', 'LOCATION', 'world', 1, 2, False)]),
            (PreProcessSteps.segmentation, [RawSegment(0, 3, None)]),
            (PreProcessSteps.syntactic_parsing,
             ["(ROOT (NP (JJ Hello) (NN world) (. .)))]"]),
        ]
        for step, value in sample_values:
            if step == desired_step:
                if save:
                    doc.save()
                return doc, value
            else:
                setter = getattr(doc, 'set_%s_result' % step.name)
                doc = setter(value)

    def setter_of(self, doc, step):
        return getattr(doc, 'set_%s_result' % step.name)

    def test_setter_methods_return_same_document_with_result_stored(self):
        for step in PreProcessSteps:
            doc, value = self.doc_ready_for(step)
            setter = self.setter_of(doc, step)
            self.assertEqual(doc, setter(value))
            self.assertTrue(doc.was_preprocess_step_done(step))

    def test_setter_do_not_save(self):
        for step in PreProcessSteps:
            doc, value = self.doc_ready_for(step)
            setter = self.setter_of(doc, step)
            with mock.patch.object(doc, 'save') as doc_save:
                setter(value)
            self.assertFalse(doc_save.called)
            from_db = IEDocument.objects.get(pk=doc.pk)
            self.assertFalse(from_db.was_preprocess_step_done(step))
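

# A sketch of the persistence contract checked above: result setters only
# mutate the in-memory instance, so saving is the caller's responsibility.
def _setter_then_save_sketch():
    doc = IEDocFactory(text='Hello world.')
    doc = doc.set_tokenization_result([(0, 'Hello'), (6, 'world'), (11, '.')])
    doc.save()  # without this, the DB row still shows the step as not done
    return doc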


class TestDocumentManagerFiltersForPreprocess(ManagerTestCase):
    ManagerClass = DocumentManager

    def test_manager_itself_iterates_over_all_documents(self):
        doc1 = IEDocFactory(text='')
        doc2 = IEDocFactory(text='something')
        doc3 = SentencedIEDocFactory(text='Some sentence. And some other. Indeed!')
        self.assertIn(doc1, self.manager)
        self.assertIn(doc2, self.manager)
        self.assertIn(doc3, self.manager)

    def test_raw_documents_are_filtered(self):
        doc1 = IEDocFactory(text='')
        doc2 = IEDocFactory(text='something')
        raws = self.manager.get_raw_documents()
        self.assertIn(doc1, raws)
        self.assertNotIn(doc2, raws)

    def test_untokenized_documents_are_filtered(self):
        doc1 = IEDocFactory(text='')
        doc2 = IEDocFactory(text='something')
        doc3 = IEDocFactory(text='something nice')
        doc4 = IEDocFactory(text='')
        step = PreProcessSteps.tokenization
        doc3.set_tokenization_result(naive_tkn(doc3.text)).save()
        doc4.set_tokenization_result([]).save()
        untokeneds = self.manager.get_documents_lacking_preprocess(step)
        self.assertIn(doc1, untokeneds)
        self.assertIn(doc2, untokeneds)
        self.assertNotIn(doc3, untokeneds)
        self.assertNotIn(doc4, untokeneds)

    def test_unsentenced_documents_are_filtered(self):
        doc1 = IEDocFactory(text='something nice')
        doc2 = IEDocFactory(text='something nicer')
        doc3 = IEDocFactory(text='something even nicer')
        doc2.set_tokenization_result(naive_tkn(doc2.text)).save()
        doc3.set_tokenization_result(naive_tkn(doc3.text)).save()
        doc3.set_sentencer_result([0, 3]).save()
        unsentenced = self.manager.get_documents_lacking_preprocess(
            PreProcessSteps.sentencer)
        self.assertIn(doc1, unsentenced)
        self.assertIn(doc2, unsentenced)
        self.assertNotIn(doc3, unsentenced)

    def test_can_get_both_unsegmented_or_unsynparsed_documents(self):
        doc1 = SentencedIEDocFactory(text='Something nice.')
        doc2 = SentencedIEDocFactory(text='Something even nicer.')
        doc3 = SentencedIEDocFactory(text='Some sentence. And some other. Indeed!')

        def filter():
            return self.manager.get_documents_lacking_preprocess(
                [PreProcessSteps.segmentation, PreProcessSteps.syntactic_parsing]
            )

        docs = [doc1, doc2, doc3]
        self.assertEqual(list(filter()), docs)
        for d in docs:
            d.set_segmentation_result([RawSegment(0, 3, None)])
            d.save()
        # Still lacking syntactic parsing, so all three are still returned.
        self.assertEqual(list(filter()), docs)
        doc1.set_syntactic_parsing_result(["(ROOT (NP (JJ Hello) (NN world) (. .)))]"])
        doc1.save()
        self.assertEqual(list(filter()), [doc2, doc3])
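

# A sketch of how a preprocess runner might consume these filters (assumed
# usage, inferred only from the manager API exercised in the tests above).
def _tokenize_pending_sketch(manager):
    for doc in manager.get_documents_lacking_preprocess(
            PreProcessSteps.tokenization):
        doc.set_tokenization_result(naive_tkn(doc.text)).save()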


class TestDocumentSentenceIterator(TestCase):

    def test_right_number_of_sentences_are_returned(self):
        doc = SentencedIEDocFactory(text='Some sentence. And some other. Indeed!')
        sentencing = doc.sentences
        sentences = list(doc.get_sentences())
        self.assertEqual(len(sentencing) - 1, len(sentences))

    def test_tokens_are_preserved(self):
        doc = SentencedIEDocFactory(text='Some sentence. And some other. Indeed!')
        sentences = list(doc.get_sentences())
        output_tokens = sum(sentences, [])
        self.assertEqual(doc.tokens, output_tokens)
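

# A sketch of the iteration contract checked above: get_sentences() yields one
# token list per sentence, so the stored boundary list (doc.sentences) always
# has exactly one more entry than there are sentences.
def _sentence_iteration_sketch():
    doc = SentencedIEDocFactory(text='Some sentence. And some other.')
    for sentence_tokens in doc.get_sentences():
        print(sentence_tokens)  # e.g. ['Some', 'sentence', '.']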