# test_stanford_preprocess.py

from unittest import TestCase, mock
from datetime import datetime

from .factories import (IEDocFactory, SentencedIEDocFactory, GazetteItemFactory,
                        EntityOccurrenceFactory, EntityKindFactory)
from .manager_case import ManagerTestCase
from iepy.preprocess.pipeline import PreProcessSteps
from iepy.preprocess.stanford_preprocess import (
    StanfordPreprocess, GazetteManager, apply_coreferences, CoreferenceError,
    StanfordAnalysis)


class TestableStanfordAnalysis(StanfordAnalysis):
    """StanfordAnalysis whose sentences are injected directly by the tests."""

    def __init__(self, hacked_sentences, *args):
        self.hacked_sentences = hacked_sentences
        super().__init__({})

    def get_sentences(self):
        return self.hacked_sentences


def sentence_factory(markup):
    """Simplistic builder of *parsed* sentences.

    Each non-empty line of the markup is interpreted as four
    whitespace-separated values:
        token offset-in-chars ner lemma
    and the sentence is returned as a list of token dicts.
    """
    sentence = []
    for line in markup.split("\n"):
        line = line.strip()
        if not line:
            continue
        token, offset, ner, lemma = line.split()
        sentence.append({
            "word": token,
            "CharacterOffsetBegin": offset,
            "NER": ner,
            "lemma": lemma,
        })
    return sentence
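

# For example, sentence_factory("Diego 0 PERSON diego") returns
# [{"word": "Diego", "CharacterOffsetBegin": "0",
#   "NER": "PERSON", "lemma": "diego"}].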


def get_analysis_from_sent_markup(markup):
    sentences = [sentence_factory(markup)]
    return TestableStanfordAnalysis(hacked_sentences=sentences)


class TestSentenceFunctions(TestCase):

    def test_get_tokens_simple(self):
        analysis = get_analysis_from_sent_markup("""
            friends x x x
            will x x x
            be x x x
            friends x x x
        """)
        X = analysis.get_tokens()
        self.assertEqual(X, "friends will be friends".split())

    def test_get_tokens_empty(self):
        self.assertEqual(TestableStanfordAnalysis([]).get_tokens(), [])

    def test_get_tokens_invalid_data(self):
        with self.assertRaises(KeyError):
            TestableStanfordAnalysis([[{"aaa": "bbb"}]]).get_tokens()

    def test_get_token_offsets_simple(self):
        analysis = get_analysis_from_sent_markup("""
            x 1 x x
            x 4 x x
            x 8 x x
            x 3 x x
        """)
        X = analysis.get_token_offsets()
        self.assertEqual(X, [1, 4, 8, 3])

    def test_get_token_offsets_empty(self):
        self.assertEqual(TestableStanfordAnalysis([]).get_token_offsets(), [])

    def test_get_token_offsets_invalid_data(self):
        with self.assertRaises(KeyError):
            TestableStanfordAnalysis([[{"aaa": "bbb"}]]).get_token_offsets()

    def test_sentence_boundaries_empty(self):
        self.assertEqual(TestableStanfordAnalysis([]).get_sentence_boundaries(), [0])

    def test_sentence_boundaries_simple(self):
        sentences = [
            sentence_factory("x x x x\n" * 3),  # 3 words
            sentence_factory("x x x x\n" * 2),  # 2 words
            sentence_factory("x x x x\n" * 4),  # 4 words
        ]
        #           1st  2nd  3rd    end
        expected = [0,   3,   3 + 2, 3 + 2 + 4]
        analysis = TestableStanfordAnalysis(sentences)
        self.assertEqual(analysis.get_sentence_boundaries(), expected)
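
    # Note: get_sentence_boundaries returns cumulative token counts: it starts
    # at 0, appends one entry per sentence, and its last entry equals the total
    # number of tokens, as the two tests above and the one below verify.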

    def test_offsets_and_tokens_work_together(self):
        sentences = [
            sentence_factory("a x x x\n" * 3),  # 3 words
            sentence_factory("b x x x\n" * 2),  # 2 words
            sentence_factory("c x x x\n" * 4),  # 4 words
            sentence_factory("d x x x\n" * 5),  # 5 words
        ]
        analysis = TestableStanfordAnalysis(sentences)
        words = analysis.get_tokens()
        offsets = analysis.get_sentence_boundaries()
        self.assertEqual(len(words), offsets[-1])
        self.assertEqual(words[offsets[1]], "b")
        self.assertEqual(words[offsets[1] - 1], "a")
        self.assertEqual(words[offsets[3]], "d")
        self.assertEqual(words[offsets[3] - 1], "c")

    def test_get_entity_occurrences_simple(self):
        a = sentence_factory("""
            a 0 O x
            b 1 B x
            c 2 O x
        """)
        b = sentence_factory("""
            d 3 O x
            e 4 O x
            f 5 O x
        """)
        c = sentence_factory("""
            g 6 D x
            h 7 D x
            i 8 O x
            j 9 O x
            k 10 H x
        """)
        d = sentence_factory("""
            l 11 H x
            m 12 O x
            n 13 O x
            o 14 L x
        """)
        analysis = TestableStanfordAnalysis([a, b, c, d])
        expected = [
            (1, 2, "B"),    # the token "b" in the first sentence
            (6, 8, "D"),    # first two tokens of the third sentence
            (10, 11, "H"),  # last token of the third sentence
            (11, 12, "H"),  # first token of the fourth sentence
            (14, 15, "L"),  # last token of the fourth sentence
        ]
        self.assertEqual(analysis.get_entity_occurrences(), expected)
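
    # The expected list encodes the grouping rule exercised here: consecutive
    # tokens sharing the same non-"O" NER tag are collapsed into a single
    # occurrence, given as a half-open (start, end) pair of document-level
    # token indices, and a run is not merged across a sentence boundary (hence
    # the two separate "H" occurrences at tokens 10 and 11).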

    def test_get_lemmas_empty(self):
        self.assertEqual(TestableStanfordAnalysis([]).get_lemmas(), [])

    def test_get_lemmas_and_tokens_same_length(self):
        sentences = [
            sentence_factory("x x x x\n" * 3),  # 3 words
            sentence_factory("x x x x\n" * 2),  # 2 words
            sentence_factory("x x x x\n" * 4),  # 4 words
            sentence_factory("x x x x\n" * 5),  # 5 words
        ]
        analysis = TestableStanfordAnalysis(sentences)
        tokens = analysis.get_tokens()
        lemmas = analysis.get_lemmas()
        self.assertEqual(len(tokens), len(lemmas))


class TestPreProcessCall(ManagerTestCase):

    def _doc_creator(self, mark_as_done):
        doc = SentencedIEDocFactory()
        for step in mark_as_done:
            setattr(doc, "{}_done_at".format(step.name), datetime.now())
        doc.save()
        return doc

    def setUp(self):
        pps = PreProcessSteps
        self._all_steps = [
            pps.tokenization,
            pps.sentencer,
            pps.tagging,
            pps.ner,
            pps.lemmatization,
            pps.syntactic_parsing,
        ]
        patcher = mock.patch("iepy.preprocess.corenlp.get_analizer")
        self.mock_get_analizer = patcher.start()
        self.mock_analizer = self.mock_get_analizer.return_value
        self.addCleanup(patcher.stop)
        self.stanfordpp = StanfordPreprocess()
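        # Patching get_analizer keeps these tests from talking to a real
        # Stanford CoreNLP process; self.mock_analizer stands in for the
        # analyzer whose analyse() method the preprocess would otherwise call.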

    def test_if_all_steps_are_done_then_no_step_is_run(self):
        doc = self._doc_creator(mark_as_done=self._all_steps)
        self.stanfordpp(doc)
        self.assertFalse(self.mock_analizer.analyse.called)

    def test_if_all_steps_are_done_but_in_override_mode_then_all_are_run_again(self):
        doc = self._doc_creator(mark_as_done=self._all_steps[:])
        self.mock_analizer.analyse.return_value = {}
        self.stanfordpp.override = True
        self.stanfordpp(doc)
        self.assertTrue(self.mock_analizer.analyse.called)

    def test_for_new_doc_all_steps_are_done_when_preprocessed(self):
        doc = IEDocFactory()
        self.mock_analizer.analyse.return_value = {}
        self.stanfordpp(doc)
        for step in self._all_steps:
            self.assertTrue(doc.was_preprocess_step_done(step))

    def test_lemmatization_is_run_even_if_all_others_already_did(self):
        # Lemmatization was added in release 0.9.1; this checks that documents
        # preprocessed with older versions can be incrementally upgraded.
        doc_no_lemmas = self._doc_creator(
            [s for s in self._all_steps if s is not PreProcessSteps.lemmatization])
        with mock.patch.object(self.stanfordpp, "lemmatization_only") as mock_lemmatization:
            mock_lemmatization.side_effect = lambda x: None
            self.stanfordpp(doc_no_lemmas)
            self.assertTrue(mock_lemmatization.called)

    def test_syntactic_parsing_is_run_even_if_all_others_already_did(self):
        # Syntactic parsing was added in release 0.9.2; this checks that
        # documents preprocessed with older versions can be incrementally
        # upgraded.
        doc_no_synparse = self._doc_creator(
            [s for s in self._all_steps if s is not PreProcessSteps.syntactic_parsing])
        with mock.patch.object(self.stanfordpp, "syntactic_parsing_only") as mock_synparse:
            mock_synparse.side_effect = lambda x: None
            self.stanfordpp(doc_no_synparse)
            self.assertTrue(mock_synparse.called)

    def test_can_add_ner_on_incremental_mode_over_already_preprocessed_documents(self):
        doc_done = self._doc_creator(mark_as_done=self._all_steps)
        doc_fresh = IEDocFactory()
        self.stanfordpp.increment_ner = True
        p = lambda x: mock.patch.object(self.stanfordpp, x)
        with p("increment_ner_only") as mock_ner_only:
            with p("run_everything") as mock_run_everything:
                self.stanfordpp(doc_done)
                self.assertEqual(mock_ner_only.call_count, 1)
                self.assertEqual(mock_run_everything.call_count, 0)
                self.stanfordpp(doc_fresh)
                self.assertEqual(mock_ner_only.call_count, 1)
                self.assertEqual(mock_run_everything.call_count, 1)
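
    # In other words: with increment_ner set, an already-preprocessed document
    # only gets the NER increment, while a fresh document still goes through
    # the full pipeline.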


class TestGazetteer(ManagerTestCase):

    def test_generate_gazettes_file_empty(self):
        self.assertEqual(GazetteManager().generate_stanford_gazettes_file(), None)

    def _test_single_gazette(self, text=None):
        if text:
            gazette_item = GazetteItemFactory(text=text)
        else:
            gazette_item = GazetteItemFactory()
        gzmanager = GazetteManager()
        filepath = gzmanager.generate_stanford_gazettes_file()
        self.assertNotEqual(filepath, None)
        data = open(filepath).read().split("\n")
        self.assertEqual(len(data), 2)
        data = data[0].split("\t")
        self.assertEqual(len(data), 3)
        self.assertEqual(data[0], gzmanager.escape_text(gazette_item.text))
        self.assertEqual(data[1], "{}{}".format(gzmanager._PREFIX, gazette_item.kind.name))
        gazette_item.delete()

    def test_generate_gazettes_several_lines(self):
        [GazetteItemFactory() for x in range(10)]
        filepath = GazetteManager().generate_stanford_gazettes_file()
        self.assertNotEqual(filepath, None)
        data = open(filepath).read()
        self.assertEqual(data.count("\n"), 10)
        for line in data.split("\n")[:-1]:
            self.assertEqual(line.count("\t"), 2)
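
    # As the two tests above pin down, the generated gazette file is plain
    # text with one entry per line and three tab-separated fields; the first
    # is the escaped gazette text and the second the kind name with the
    # manager's prefix prepended.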

    def test_generate_gazettes_one_line(self):
        self._test_single_gazette()

    def test_gazettes_unicode(self):
        self._test_single_gazette("#½]}→@}#½ĸ@#")
        self._test_single_gazette("ħøłæ")
        self._test_single_gazette("æ}@ł¢µ«»µ«»“~þðøđþ")
        self._test_single_gazette("\\ || \\ ()(()))) \\\\ |")

    def test_gazettes_same_eo_has_same_entity(self):
        tokens = ("The nominees were Stuart Little and Memento "
                  "but the winner was Stuart Little").split()
        analysis = get_analysis_from_sent_markup(
            " ".join(["{} x x x\n".format(x) for x in tokens]))
        fake_gazetteer = mock.MagicMock()
        fake_gazetteer.was_entry_created_by_gazette.return_value = True
        with mock.patch.object(analysis, "get_entity_occurrences") as mock_eos:
            mock_eos.return_value = [
                (3, 5, "MOVIE"),    # first occurrence of "Stuart Little"
                (6, 7, "MOVIE"),    # occurrence of "Memento"
                (11, 13, "MOVIE"),  # second occurrence of "Stuart Little"
            ]
            found_entities = analysis.get_found_entities('random_string', fake_gazetteer)
            # Occurrences with the same alias that came from the gazetteer
            # share the same Entity...
            self.assertEqual(found_entities[0].key, found_entities[2].key)
            # ...but occurrences with different aliases do not.
            self.assertNotEqual(found_entities[0].key, found_entities[1].key)

    def test_escaping(self):
        texts = [
            "Maradona",
            "El Diego",
            "El Diego ( el 10 )",
            "|()|",
            "æßðæßð",
            "\\ hello \\ ",
            "*",
        ]
        gm = GazetteManager()
        for text in texts:
            escaped = gm.escape_text(text)
            self.assertEqual(escaped.count("\\Q"), len(text.split()))
            self.assertEqual(escaped.count("\\E"), len(text.split()))
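
    # escape_text is expected to wrap each whitespace-separated token in
    # \Q...\E, the Java regex literal-quoting markers, so that entries
    # containing metacharacters (parens, pipes, backslashes, "*") are matched
    # verbatim by Stanford's regex-based gazette matching.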


class TestMergeCoreferences(ManagerTestCase):

    def setUp(self):
        self.doc = SentencedIEDocFactory(
            text="Diego did it . He scored on the first half , and now he did it again . "
            #     0     1   2  3 4  5      6  7   8     9    10 11  12  13 14 15 16    17
                 "Diego Maradona , the best player ever , won the game alone .")
        #         18    19       20 21  22   23     24   25 26  27  28   29   30
        # mentions is a list of (start, end, head) triplets, one per mention
        self.mentions = [(0, 1, 0),     # Diego
                         (4, 5, 4),     # He
                         (13, 14, 13),  # he
                         (18, 20, 18),  # Diego Maradona
                         (21, 25, 23),  # the best player ever
                         ]
        assert self.doc.entity_occurrences.count() == 0
        self.sample_ekind = EntityKindFactory()

    def merge(self, corefs):
        # runs apply_coreferences on the document under test
        return apply_coreferences(self.doc, corefs)

    def create_eo_with_mention(self, mention):
        return EntityOccurrenceFactory(
            document=self.doc, entity__kind=self.sample_ekind,
            offset=mention[0], offset_end=mention[1])

    def test_if_none_of_the_mentions_is_already_an_EO_nothing_happens(self):
        self.merge(self.mentions[:])
        self.assertEqual(self.doc.entity_occurrences.count(), 0)

    def test_merging_when_there_are_EOs_from_different_kinds_fails(self):
        for m in self.mentions[:2]:
            eo = self.create_eo_with_mention(m)
            eo.entity.kind = EntityKindFactory()
            eo.entity.save()
        self.assertRaises(CoreferenceError, self.merge, self.mentions[:])

    def test_if_only_one_EO_existed_then_all_others_are_created_with_same_entity(self):
        original_eo = self.create_eo_with_mention(self.mentions[0])
        self.merge(self.mentions[:])
        self.assertEqual(self.doc.entity_occurrences.count(), len(self.mentions))
        for eo in self.doc.entity_occurrences.all():
            self.assertEqual(eo.entity, original_eo.entity)
            if eo != original_eo:
                self.assertTrue(eo.anaphora)

    def test_if_all_the_existent_EOs_are_from_anaphora_no_new_ones_are_created(self):
        original_eo = self.create_eo_with_mention(self.mentions[0])
        original_eo.anaphora = True
        original_eo.save()
        self.merge(self.mentions[:])
        self.assertEqual(self.doc.entity_occurrences.count(), 1)

    def test_if_coexist_EO_from_gazette_and_EO_from_NER_entity_of_the_latter_is_used(self):
        eo_from_gz = self.create_eo_with_mention(self.mentions[0])
        eo_from_gz.entity.gazette = GazetteItemFactory()
        eo_from_gz.entity.save()
        eo_from_ner = self.create_eo_with_mention(self.mentions[1])
        expected_entity = eo_from_ner.entity
        self.merge(self.mentions[:])
        self.assertEqual(self.doc.entity_occurrences.count(), len(self.mentions))
        for eo in self.doc.entity_occurrences.all():
            # this also reloads eo_from_gz and eo_from_ner
            self.assertEqual(eo.entity, expected_entity)

    def test_if_coexist_several_EOs_from_NER_the_entity_of_first_is_used(self):
        eo_1 = self.create_eo_with_mention(self.mentions[0])
        eo_2 = self.create_eo_with_mention(self.mentions[1])
        assert eo_1.entity != eo_2.entity
        expected_entity = eo_1.entity
        self.merge(self.mentions[:])
        for eo in self.doc.entity_occurrences.all():
            # this also reloads eo_1 and eo_2
            self.assertEqual(eo.entity, expected_entity)

    def test_cant_merge_several_EOs_from_different_GZ_items(self):
        eo_1 = self.create_eo_with_mention(self.mentions[0])
        eo_1.entity.gazette = GazetteItemFactory()
        eo_1.entity.save()
        eo_2 = self.create_eo_with_mention(self.mentions[1])
        eo_2.entity.gazette = GazetteItemFactory()
        eo_2.entity.save()
        self.assertRaises(CoreferenceError, self.merge, self.mentions[:])