# test_tokenizer.py

from unittest import TestCase

import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer

try:
    from unittest import mock
except ImportError:
    import mock

from iepy.preprocess.tokenizer import en_tokenize_and_segment, _get_tokenizer
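
# The tests below exercise en_tokenize_and_segment(), which (as used here)
# returns a dict with the keys 'tokens', 'spans' (one character offset per
# token) and 'sentences' (token indices marking sentence boundaries).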


class TestTokenization(TestCase):

    def check_expected_words_are_in_tokenization(self, text, expected_words):
        # Helper: tokenize `text` and assert every expected word appears
        # among the resulting tokens.
        words = en_tokenize_and_segment(text)['tokens']
        for expected_word in expected_words:
            self.assertIn(expected_word, words)

    def test_point_between_words_is_captured(self):
        text = u"The dog is hungry.The cat is evil."
        expected = [u"dog", u"hungry", u"evil", u"."]
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_hours_are_not_split(self):
        text = u"It's 3:39 am, what do you want?"
        expected = [u'3:39']
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_its_contraction_apostrophe_is_not_split(self):
        text = u"It's 3:39 am, what do you want?"
        expected = [u"It's"]
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_question_mark_is_split(self):
        text = u"It's 3:39 am, what do you want?"
        expected = [u"want", u"?"]
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_web_address_is_not_split(self):
        text = u"Visit http://google.com"
        expected = [u"http://google.com"]
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_complex_address_is_not_split(self):
        text = u"Try with ssh://tom@hawk:2020 and tell me"
        expected = [u"ssh://tom@hawk:2020"]
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_Im_arent_and_dont_contraction_apostrophes_are_not_split(self):
        text = u"I'm ready for you all. Aren't you ready?. Don't you?"
        expected = [u"I'm", u"Aren't", u"Don't"]
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_hyphenated_dates_are_not_split(self):
        text = u"Back to 10-23-1984 but not to 23/10/1984"
        expected = [u'10-23-1984']
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_slashed_dates_are_split(self):
        text = u"Back to 23/10/1984"
        expected = [u"10", u"23", u"1984"]
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_hyphenated_words_are_not_split(self):
        text = u"User-friendliness is a must, use get_text."
        expected = [u'User-friendliness']
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_underscore_words_are_not_split(self):
        text = u"User-friendliness is a must, use get_text."
        expected = [u'get_text']
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_colon_is_split(self):
        text = u"read what I have to say:I like turtles."
        expected = [u'say', u':', u'I']
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_possessive_apostrophe_IS_split(self):
        text = u"John's bar is cool."
        expected = [u'John', u"'s", u'cool']
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_emoticons_detection(self):
        text = u"John's bar is cool, right :) XD?"
        expected = [u':)', u'XD', u'?']
        self.check_expected_words_are_in_tokenization(text, expected)

    def test_parentheses_are_split(self):
        text = u"The wolf (starved to death), killed a duck."
        expected = [u'(', u'starved', u'death', u')', u',']
        self.check_expected_words_are_in_tokenization(text, expected)


class TestTokensOffsets(TestCase):

    def test_there_is_an_offset_per_token(self):
        text = u"The wolf (starved to death), killed a duck."
        tokens = en_tokenize_and_segment(text)['tokens']
        offsets = en_tokenize_and_segment(text)['spans']
        self.assertEqual(len(tokens), len(offsets))

    def test_each_offset_is_the_exact_location_of_the_token_in_the_text(self):
        text = (u"John's bar is cool, right :) XD? "
                u"The wolf (starved to death), killed a duck.")
        tokens = en_tokenize_and_segment(text)['tokens']
        offsets = en_tokenize_and_segment(text)['spans']
        for tkn, off in zip(tokens, offsets):
            self.assertEqual(text[off:len(tkn) + off], tkn)


class TestSegmentation(TestCase):
    """If N sentences are found, N + 1 numbers are returned, where the numbers
    at positions i and i + 1 give the start and end (in tokens) of the i-th
    sentence.
    """

    def test_zero_is_always_included(self):
        text = "The wolf killed a duck. What a pity"
        sents = en_tokenize_and_segment(text)['sentences']
        self.assertEqual(sents[0], 0)

    def test_zero_is_all_there_is_even_if_no_tokens(self):
        text = ""
        sents = en_tokenize_and_segment(text)['sentences']
        self.assertEqual(sents, [0])

    def test_number_of_tokens_is_always_last(self):
        text = "The wolf killed a duck. What a pity"
        pieces = en_tokenize_and_segment(text)
        sents = pieces['sentences']
        tkns = pieces['tokens']
        self.assertEqual(sents[-1], len(tkns))

    def test_nltk_punkt_sentence_tokenizer_is_used(self):
        text = "The wolf killed a duck. What a pity"
        with mock.patch.object(PunktSentenceTokenizer, 'span_tokenize') as nltk_sent:
            nltk_sent.return_value = [(0, 5)]
            en_tokenize_and_segment(text)
            nltk_sent.assert_called_once_with(text)

    def test_sentences_with_big_text(self):
        text = (u"The Bastard Operator From Hell (BOFH), a fictional character "
                u"created by Simon Travaglia, is a rogue system administrator who "
                u"takes out his anger on users (often referred to as lusers), "
                u"colleagues, bosses, and anyone else who pesters him with their "
                u"pitiful user created \"problems\".\n"
                u"The BOFH stories were originally posted in 1992 to Usenet by "
                u"Travaglia, with some being reprinted in Datamation. They were "
                u"published weekly from 1995 to 1999 in Network Week and since 2000"
                u" they have been published most weeks in The Register. They were "
                u"also published in PC Plus magazine for a short time, and several"
                u" books of the stories have also been released.")
        tokenizer = _get_tokenizer()
        expected_sentences = [0]
        sentence_splitter = nltk.data.load("tokenizers/punkt/english.pickle")
        for i, j in sentence_splitter.span_tokenize(text):
            expected_sentences.append(len(list(tokenizer.span_tokenize(text[:j]))))
        sents = en_tokenize_and_segment(text)['sentences']
        self.assertEqual(expected_sentences, sents)