1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- from unittest import TestCase
- from iepy.data.models import IEDocument
- from iepy.preprocess.ner.literal import LiteralNER, LiteralNERRunner
- from iepy.preprocess.pipeline import PreProcessSteps
- from .factories import SentencedIEDocFactory, NamedTemporaryFile23
- from .manager_case import ManagerTestCase
- from .test_ner import NERTestMixin
# Entity kinds handed to LiteralNER in TestLiteralNER. They are passed together
# with a list of literal files of the same length: the first file holds the
# DISEASE literals, the second the MEDICAL_TEST literals.
NEW_ENTITIES = ['DISEASE', 'MEDICAL_TEST']
class TestLiteralNER(TestCase):
    """Tests for LiteralNER driven by two literal files.

    ``setUp`` writes one file of disease literals and one file of medical-test
    literals; each test builds a ``LiteralNER`` from ``NEW_ENTITIES`` and the
    names of those two files, then checks its output on whitespace-split
    sentences.
    """

    def _literals_file(self, content):
        """Return a named temporary file holding *content*.

        The file is closed through ``addCleanup`` once the test is over, so
        no handle (or temp file) outlives the test that created it.
        """
        f = NamedTemporaryFile23(mode="w", encoding="utf8")
        # Register the cleanup first so the handle is released even if the
        # write below raises.
        self.addCleanup(f.close)
        f.write(content)
        # Kept from the original fixture. NOTE(review): presumably this is
        # what flushes the buffered text so the file can be read by name --
        # confirm against NamedTemporaryFile23.
        f.seek(0)
        return f

    def setUp(self):
        """Create the two literal files used by every test."""
        # One literal per line; "drooling" is listed in both files.
        self.tmp_file1 = self._literals_file(
            'HIV\nHepatitis C\nbrain tumor\ndrooling\n')
        self.tmp_file2 = self._literals_file('MRI\nCT scan\ndrooling\n')

    def test_tagging(self):
        """``tag`` yields one (token, tag) pair per token; 'O' means no entity."""
        tagger = LiteralNER(NEW_ENTITIES,
                            [self.tmp_file1.name, self.tmp_file2.name])
        cases = [
            ("Chase notes she's negative for HIV and Hepatitis C",
             'O O O O O DISEASE O DISEASE DISEASE'),
            ("Cuddy points out that the CT scan showed the patient has a metal "
             "pin in her arm and can't undergo an MRI",
             'O O O O O MEDICAL_TEST MEDICAL_TEST O O O O O O O O O O O O O O MEDICAL_TEST'),
            # Literals of both kinds placed back to back.
            ("CT scan said HIV MRI Hepatitis C",
             'MEDICAL_TEST MEDICAL_TEST O DISEASE MEDICAL_TEST DISEASE DISEASE'),
        ]
        for sentence, expected in cases:
            result = tagger.tag(sentence.split())
            tags = [tag for _, tag in result]
            self.assertEqual(tags, expected.split())

    def test_entities(self):
        """``entities`` returns ((start, end), kind) items over token offsets.

        In the expected values below ``end`` is exclusive: a one-token literal
        at index 5 is reported as (5, 6).
        """
        tagger = LiteralNER(NEW_ENTITIES,
                            [self.tmp_file1.name, self.tmp_file2.name])
        cases = [
            ("Chase notes she's negative for HIV and Hepatitis C",
             [((5, 6), 'DISEASE'), ((7, 9), 'DISEASE')]),
            ("Cuddy points out that the CT scan showed the patient has a metal pin in her arm and can't undergo an MRI",
             [((5, 7), 'MEDICAL_TEST'), ((21, 22), 'MEDICAL_TEST')]),
            ("CT scan said HIV MRI Hepatitis C",
             [((0, 2), 'MEDICAL_TEST'), ((3, 4), 'DISEASE'),
              ((4, 5), 'MEDICAL_TEST'), ((5, 7), 'DISEASE')]),
        ]
        for sentence, expected_entities in cases:
            result = tagger.entities(sentence.split())
            self.assertEqual(result, expected_entities)
class TestLiteralNERRunner(ManagerTestCase, NERTestMixin):
    """Test for LiteralNERRunner applied to a sentenced document.

    The runner is called on a document built by ``SentencedIEDocFactory`` and
    the outcome is verified with ``check_ner_result`` from ``NERTestMixin``.
    """

    def _literals_file(self, content):
        """Return a named temporary file holding *content*.

        The file is closed through ``addCleanup`` once the test is over, so
        no handle (or temp file) outlives the test that created it.
        """
        f = NamedTemporaryFile23(mode="w", encoding="utf8")
        # Register the cleanup first so the handle is released even if the
        # write below raises.
        self.addCleanup(f.close)
        f.write(content)
        # Kept from the original fixture. NOTE(review): presumably this is
        # what flushes the buffered text so the file can be read by name --
        # confirm against NamedTemporaryFile23.
        f.seek(0)
        return f

    def setUp(self):
        """Create the literal files for the test.

        NOTE(review): as before, this does not chain to the parent ``setUp``;
        confirm that ManagerTestCase needs no per-test setup.
        """
        self.tmp_file1 = self._literals_file(
            'HIV\nHepatitis C\nbrain tumor\ndrooling\n')
        # Not read by ``test`` below; still created so the attribute exists.
        self.tmp_file2 = self._literals_file('MRI\nCT scan\ndrooling\n')

    def test(self):
        """Run the literal tagger on a document and check the found entities."""
        doc = SentencedIEDocFactory(
            text="Chase notes she's negative for HIV and Hepatitis C")
        # The kind is given in lower case while the expected triples below use
        # 'DISEASE' -- presumably the runner normalises the label; verify.
        lit_tagger_runner = LiteralNERRunner(['disease'], [self.tmp_file1.name])
        lit_tagger_runner(doc)
        # (the tokenizer splits she's in two parts)
        entities_triples = [(6, 7, 'DISEASE'), (8, 10, 'DISEASE')]
        self.check_ner_result(doc, entities_triples)
|