test_literal_ner.py 3.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. from unittest import TestCase
  2. from iepy.data.models import IEDocument
  3. from iepy.preprocess.ner.literal import LiteralNER, LiteralNERRunner
  4. from iepy.preprocess.pipeline import PreProcessSteps
  5. from .factories import SentencedIEDocFactory, NamedTemporaryFile23
  6. from .manager_case import ManagerTestCase
  7. from .test_ner import NERTestMixin
  8. NEW_ENTITIES = ['DISEASE', 'MEDICAL_TEST']
  9. class TestLiteralNER(TestCase):
  10. def setUp(self):
  11. f = NamedTemporaryFile23(mode="w", encoding="utf8")
  12. f.write('HIV\nHepatitis C\nbrain tumor\ndrooling\n')
  13. f.seek(0)
  14. self.tmp_file1 = f
  15. f = NamedTemporaryFile23(mode="w", encoding="utf8")
  16. f.write('MRI\nCT scan\ndrooling\n')
  17. f.seek(0)
  18. self.tmp_file2 = f
  19. def test_tagging(self):
  20. tagger = LiteralNER(NEW_ENTITIES,
  21. [self.tmp_file1.name, self.tmp_file2.name])
  22. s = "Chase notes she's negative for HIV and Hepatitis C"
  23. result = tagger.tag(s.split())
  24. tags = [tag for _, tag in result]
  25. expected_tags = 'O O O O O DISEASE O DISEASE DISEASE'.split()
  26. self.assertEqual(tags, expected_tags)
  27. s = ("Cuddy points out that the CT scan showed the patient has a metal "
  28. "pin in her arm and can't undergo an MRI")
  29. result = tagger.tag(s.split())
  30. tags = [tag for _, tag in result]
  31. expected_tags = 'O O O O O MEDICAL_TEST MEDICAL_TEST O O O O O O O O O O O O O O MEDICAL_TEST'.split()
  32. self.assertEqual(tags, expected_tags)
  33. s = "CT scan said HIV MRI Hepatitis C"
  34. result = tagger.tag(s.split())
  35. tags = [tag for _, tag in result]
  36. expected_tags = 'MEDICAL_TEST MEDICAL_TEST O DISEASE MEDICAL_TEST DISEASE DISEASE'.split()
  37. self.assertEqual(tags, expected_tags)
  38. def test_entities(self):
  39. tagger = LiteralNER(NEW_ENTITIES,
  40. [self.tmp_file1.name, self.tmp_file2.name])
  41. s = "Chase notes she's negative for HIV and Hepatitis C"
  42. result = tagger.entities(s.split())
  43. expected_entities = [((5, 6), 'DISEASE'), ((7, 9), 'DISEASE')]
  44. self.assertEqual(result, expected_entities)
  45. s = "Cuddy points out that the CT scan showed the patient has a metal pin in her arm and can't undergo an MRI"
  46. result = tagger.entities(s.split())
  47. expected_entities = [((5, 7), 'MEDICAL_TEST'), ((21, 22), 'MEDICAL_TEST')]
  48. self.assertEqual(result, expected_entities)
  49. s = "CT scan said HIV MRI Hepatitis C"
  50. result = tagger.entities(s.split())
  51. expected_entities = [((0, 2), 'MEDICAL_TEST'), ((3, 4), 'DISEASE'),
  52. ((4, 5), 'MEDICAL_TEST'), ((5, 7), 'DISEASE')]
  53. self.assertEqual(result, expected_entities)
  54. class TestLiteralNERRunner(ManagerTestCase, NERTestMixin):
  55. def setUp(self):
  56. f = NamedTemporaryFile23(mode="w", encoding="utf8")
  57. f.write('HIV\nHepatitis C\nbrain tumor\ndrooling\n')
  58. f.seek(0)
  59. self.tmp_file1 = f
  60. f = NamedTemporaryFile23(mode="w", encoding="utf8")
  61. f.write('MRI\nCT scan\ndrooling\n')
  62. f.seek(0)
  63. self.tmp_file2 = f
  64. def test(self):
  65. doc = SentencedIEDocFactory(
  66. text="Chase notes she's negative for HIV and Hepatitis C")
  67. lit_tagger_runner = LiteralNERRunner(['disease'], [self.tmp_file1.name])
  68. lit_tagger_runner(doc)
  69. # (the tokenizer splits she's in two parts)
  70. entities_triples = [(6, 7, 'DISEASE'), (8, 10, 'DISEASE')]
  71. self.check_ner_result(doc, entities_triples)