test_features.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480
  1. # -*- coding: utf-8 -*-
  2. import refo
  3. from unittest import skip, mock
  4. from featureforge.validate import BaseFeatureFixture, EQ
  5. from featureforge.feature import make_feature
  6. from iepy.data.db import CandidateEvidenceManager
  7. from iepy.extraction import features
  8. from iepy.extraction.rules import rule, Token
  9. from iepy.extraction.features import (
  10. bag_of_words, bag_of_pos, bag_of_word_bigrams, bag_of_wordpos,
  11. bag_of_wordpos_bigrams, bag_of_words_in_between, bag_of_pos_in_between,
  12. bag_of_word_bigrams_in_between, bag_of_wordpos_in_between,
  13. bag_of_wordpos_bigrams_in_between, entity_order, entity_distance,
  14. other_entities_in_between, total_number_of_entities,
  15. verbs_count_in_between, verbs_count, symbols_in_between,
  16. parse_features, in_between_offsets,
  17. )
  18. from .factories import EvidenceFactory, RelationFactory
  19. from .manager_case import ManagerTestCase
  20. def _e(markup, **kwargs):
  21. base_pos = kwargs.pop('base_pos', ["DT", u"JJ", u"NN"])
  22. base_lemmas = kwargs.pop('base_lemmas', None)
  23. evidence = EvidenceFactory(markup=markup, **kwargs)
  24. evidence = CandidateEvidenceManager.hydrate(evidence)
  25. if base_lemmas is None:
  26. base_lemmas = [x.lower() for x in evidence.segment.tokens]
  27. n = len(evidence.segment.tokens)
  28. pos = (base_pos * n)[:n]
  29. evidence.segment.postags = pos
  30. evidence.segment.lemmas = base_lemmas
  31. return evidence
  32. class FeatureEvidenceBaseCase(BaseFeatureFixture):
  33. @skip(u"skipped because there's no random generation of Evidences")
  34. def test_fuzz(self):
  35. pass
  36. def test_fixtures(self):
  37. # here fixtures are database objects, so we are:
  38. # - force to construct them on runtime, (ie, not when parsing tests classes)
  39. # - better to construct them only once
  40. fixtures = {}
  41. for label, (markup, predicate, value) in self.fixtures.items():
  42. if callable(markup):
  43. datapoint = markup()
  44. else:
  45. datapoint = _e(markup)
  46. fixtures[label] = datapoint, predicate, value
  47. self.assert_feature_passes_fixture(self.feature, fixtures)
  48. class TestBagOfWords(ManagerTestCase, FeatureEvidenceBaseCase):
  49. feature = make_feature(bag_of_words)
  50. fixtures = dict(
  51. test_eq1=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  52. EQ, set(u"drinking mate makes you go to the toilet".split())),
  53. test_eq2=(u"Drinking",
  54. EQ, set(u"drinking".split())),
  55. test_eq3=(u"",
  56. EQ, set())
  57. )
  58. class TestBagOfPos(ManagerTestCase, FeatureEvidenceBaseCase):
  59. feature = make_feature(bag_of_pos)
  60. fixtures = dict(
  61. test_eq1=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  62. EQ, set(u"DT JJ NN".split())),
  63. test_eq2=(u"Drinking",
  64. EQ, set(u"DT".split())),
  65. test_eq3=(u"",
  66. EQ, set())
  67. )
  68. class TestBagOfWordBigrams(ManagerTestCase, FeatureEvidenceBaseCase):
  69. feature = make_feature(bag_of_word_bigrams)
  70. fixtures = dict(
  71. test_eq1=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  72. EQ, {(u"drinking", u"mate"), (u"mate", u"makes"), (u"makes", u"you"),
  73. (u"you", u"go"), (u"go", u"to"), (u"to", u"the"),
  74. (u"the", u"toilet")}),
  75. test_eq2=(u"Drinking mate",
  76. EQ, {(u"drinking", u"mate")}),
  77. test_eq3=(u"Drinking",
  78. EQ, set()),
  79. test_eq4=(u"",
  80. EQ, set())
  81. )
  82. class TestBagOfWordPos(ManagerTestCase, FeatureEvidenceBaseCase):
  83. feature = make_feature(bag_of_wordpos)
  84. fixtures = dict(
  85. test_eq1=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  86. EQ, {(u"drinking", u"DT"), (u"mate", u"JJ"), (u"makes", u"NN"),
  87. (u"you", u"DT"), (u"go", u"JJ"), (u"to", u"NN"), (u"the", u"DT"),
  88. (u"toilet", u"JJ")}),
  89. test_eq2=(u"Drinking",
  90. EQ, {(u"drinking", u"DT")}),
  91. test_eq3=(u"",
  92. EQ, set())
  93. )
  94. class TestBagOfWordPosBigrams(ManagerTestCase, FeatureEvidenceBaseCase):
  95. feature = make_feature(bag_of_wordpos_bigrams)
  96. fixtures = dict(
  97. test_eq1=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  98. EQ, {((u"drinking", u"DT"), (u"mate", u"JJ")),
  99. ((u"mate", u"JJ"), (u"makes", u"NN")),
  100. ((u"makes", u"NN"), (u"you", u"DT")),
  101. ((u"you", u"DT"), (u"go", u"JJ")),
  102. ((u"go", u"JJ"), (u"to", u"NN")),
  103. ((u"to", u"NN"), (u"the", u"DT")),
  104. ((u"the", u"DT"), (u"toilet", u"JJ")),
  105. }),
  106. test_eq2=(u"Drinking mate",
  107. EQ, {((u"drinking", u"DT"), (u"mate", u"JJ"))}),
  108. test_eq3=(u"Drinking",
  109. EQ, set()),
  110. test_eq4=(u"",
  111. EQ, set())
  112. )
  113. class TestBagOfWordsInBetween(ManagerTestCase, FeatureEvidenceBaseCase):
  114. feature = make_feature(bag_of_words_in_between)
  115. fixtures = dict(
  116. test_eq1=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  117. EQ, set(u"makes you go to the".split())),
  118. test_eq2=(u"Drinking {Mate|thing**} makes you go to the {toilet|thing*}",
  119. EQ, set(u"makes you go to the".split())),
  120. test_eq3=(u"Drinking {Mate|thing*} or {Tea|thing} makes you go to the {toilet|thing**}",
  121. EQ, set(u"or tea makes you go to the".split())),
  122. test_eq5=(u"{Mate|thing**} {toilet|thing*}",
  123. EQ, set()),
  124. )
  125. class TestBagOfPosInBetween(ManagerTestCase, FeatureEvidenceBaseCase):
  126. feature = make_feature(bag_of_pos_in_between)
  127. fixtures = dict(
  128. test_eq1=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  129. EQ, set(u"DT JJ NN".split())),
  130. test_eq2=(u"Drinking {Mate|thing**} makes you go to the {toilet|thing*}",
  131. EQ, set(u"DT JJ NN".split())),
  132. test_eq3=(u"{Mate|thing**} {toilet|thing*}",
  133. EQ, set()),
  134. )
  135. class TestBagOfWordBigramsInBetween(ManagerTestCase, FeatureEvidenceBaseCase):
  136. feature = make_feature(bag_of_word_bigrams_in_between)
  137. fixtures = dict(
  138. test_eq1=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  139. EQ, {(u"makes", u"you"), (u"you", u"go"), (u"go", u"to"), (u"to", u"the")}),
  140. test_eq2=(u"Drinking {Mate|thing**} makes you go to the {toilet|thing*}",
  141. EQ, {(u"makes", u"you"), (u"you", u"go"), (u"go", u"to"), (u"to", u"the")}),
  142. test_eq3=(u"{Mate|thing*} makes you {toilet|thing**}",
  143. EQ, {(u"makes", u"you")}),
  144. test_eq4=(u"{Mate|thing*} makes {toilet|thing**}",
  145. EQ, set()),
  146. test_eq6=(u"{Mate|thing**} {toilet|thing*}",
  147. EQ, set()),
  148. )
  149. class TestBagOfWordPosInBetween(ManagerTestCase, FeatureEvidenceBaseCase):
  150. feature = make_feature(bag_of_wordpos_in_between)
  151. fixtures = dict(
  152. test_eq1=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  153. EQ, {(u"makes", u"NN"), (u"you", u"DT"), (u"go", u"JJ"), (u"to", u"NN"), (u"the", u"DT")}),
  154. test_eq2=(u"Drinking {Mate|thing**} makes you go to the {toilet|thing*}",
  155. EQ, {(u"makes", u"NN"), (u"you", u"DT"), (u"go", u"JJ"), (u"to", u"NN"), (u"the", u"DT")}),
  156. test_eq6=(u"{Mate|thing**} {toilet|thing*}",
  157. EQ, set()),
  158. )
  159. class TestBagOfWordPosBigramsInBetween(ManagerTestCase, FeatureEvidenceBaseCase):
  160. feature = make_feature(bag_of_wordpos_bigrams_in_between)
  161. fixtures = dict(
  162. test_eq1=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  163. EQ, {((u"makes", u"NN"), (u"you", u"DT")),
  164. ((u"you", u"DT"), (u"go", u"JJ")),
  165. ((u"go", u"JJ"), (u"to", u"NN")),
  166. ((u"to", u"NN"), (u"the", u"DT")),
  167. }),
  168. test_eq2=(u"Drinking {Mate|thing**} makes you go to the {toilet|thing*}",
  169. EQ, {((u"makes", u"NN"), (u"you", u"DT")),
  170. ((u"you", u"DT"), (u"go", u"JJ")),
  171. ((u"go", u"JJ"), (u"to", u"NN")),
  172. ((u"to", u"NN"), (u"the", u"DT")),
  173. }),
  174. test_eq6=(u"{Mate|thing**} {toilet|thing*}",
  175. EQ, set()),
  176. )
  177. class TestEntityOrder(ManagerTestCase, FeatureEvidenceBaseCase):
  178. feature = make_feature(entity_order)
  179. fixtures = dict(
  180. test_lr=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  181. EQ, 1),
  182. test_rl=(u"Drinking {Mate|thing**} makes you go to the {toilet|thing*}",
  183. EQ, 0),
  184. )
  185. class TestEntityDistance(ManagerTestCase, FeatureEvidenceBaseCase):
  186. feature = make_feature(entity_distance)
  187. fixtures = dict(
  188. test_lr=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  189. EQ, 5),
  190. test_rl=(u"Drinking {Mate|thing**} makes you go to the {toilet|thing*}",
  191. EQ, 5),
  192. test_multiword=(u"Drinking {Argentinean Mate|thing*} the {toilet|thing**}",
  193. EQ, 1),
  194. test_zero=(u"Drinking {Argentinean Mate|thing*} {toilet|thing**}",
  195. EQ, 0),
  196. )
  197. class TestOtherEntitiesInBetween(ManagerTestCase, FeatureEvidenceBaseCase):
  198. feature = make_feature(other_entities_in_between)
  199. fixtures = dict(
  200. test_lr=(u"Drinking {Mate|thing*} makes {you|told} go to the {toilet|thing**}",
  201. EQ, 1),
  202. test_rl=(u"Drinking {Mate|thing**} makes {you|told} go to the {toilet|thing*}",
  203. EQ, 1),
  204. test_many=(u"Drinking {Mate|thing**} {makes|yeah} {you|told} {go|bad} {to|music} {the|aaa} {toilet|thing*}",
  205. EQ, 5),
  206. test_multiword=(u"Drinking {Argentinean Mate|thing*} {the|told} {toilet|thing**}",
  207. EQ, 1),
  208. test_zero=(u"Drinking {Argentinean Mate|thing*} {toilet|thing**}",
  209. EQ, 0),
  210. test_zero2=(u"Drinking {Argentinean Mate|thing*} the {toilet|thing**}",
  211. EQ, 0),
  212. )
  213. class TestTotalEntitiesNumber(ManagerTestCase, FeatureEvidenceBaseCase):
  214. feature = make_feature(total_number_of_entities)
  215. fixtures = dict(
  216. test_lr=(u"Drinking {Mate|thing*} makes {you|told} go to the {toilet|thing**}",
  217. EQ, 3),
  218. test_rl=(u"Drinking {Mate|thing**} makes {you|told} go to the {toilet|thing*}",
  219. EQ, 3),
  220. test_many=(u"Drinking {Mate|thing**} {makes|yeah} {you|told} {go|bad} {to|music} {the|aaa} {toilet|thing*}",
  221. EQ, 7),
  222. test_multiword=(u"Drinking {Argentinean Mate|thing*} {the|told} {toilet|thing**}",
  223. EQ, 3),
  224. test_zero=(u"Drinking {Argentinean Mate|thing*} {toilet|thing**}",
  225. EQ, 2),
  226. test_zero2=(u"Drinking {Argentinean Mate|thing*} the {toilet|thing**}",
  227. EQ, 2),
  228. )
  229. class TestVerbsInBetweenEntitiesCount(ManagerTestCase, FeatureEvidenceBaseCase):
  230. feature = make_feature(verbs_count_in_between)
  231. fixtures = dict(
  232. test_none=(
  233. lambda: _e(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  234. base_pos=["JJ"]),
  235. EQ, 0),
  236. test_all=(
  237. lambda: _e(u"Drinking {Mate|thing**} makes you go to the {toilet|thing*}",
  238. base_pos=["VB", u"VBD"]),
  239. EQ, 5),
  240. )
  241. class TestVerbsTotalCount(ManagerTestCase, FeatureEvidenceBaseCase):
  242. feature = make_feature(verbs_count)
  243. fixtures = dict(
  244. test_none=(
  245. lambda: _e(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  246. base_pos=["JJ"]),
  247. EQ, 0),
  248. test_all=(
  249. lambda: _e(u"Drinking {Argentinean Mate|thing**} makes you go to the {toilet|thing*}",
  250. base_pos=["VB", u"VBD"]),
  251. EQ, 9),
  252. test_empty=(u"",
  253. EQ, 0),
  254. test_no_entity=(u"Drinking mate yeah",
  255. EQ, 0),
  256. )
  257. class TestSymbolsInBetween(ManagerTestCase, FeatureEvidenceBaseCase):
  258. feature = make_feature(symbols_in_between)
  259. fixtures = dict(
  260. test_none=(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  261. EQ, 0),
  262. test_one=(u"Drinking {Mate|thing**}, makes you go to the {toilet|thing*}",
  263. EQ, 1),
  264. test_two=(u"Drinking {Mate|thing**}, makes you go, to the {toilet|thing*}",
  265. EQ, 1), # its only boolean
  266. )
  267. class TestLemmasInBetweenEntitiesCount(ManagerTestCase, FeatureEvidenceBaseCase):
  268. def lemmas_count_in_between(datapoint):
  269. i, j = in_between_offsets(datapoint)
  270. return len([x for x in datapoint.segment.lemmas[i:j]])
  271. feature = make_feature(lemmas_count_in_between)
  272. fixtures = dict(
  273. test_lemmas=(
  274. lambda: _e(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}"),
  275. EQ, 5),
  276. test_none=(
  277. lambda: _e(u"Drinking {Mate|thing*} {rocks|feeling**}"),
  278. EQ, 0),
  279. )
  280. class TestBagOfLemmas(ManagerTestCase, FeatureEvidenceBaseCase):
  281. def bag_of_lemmas(datapoint):
  282. return set(datapoint.segment.lemmas)
  283. feature = make_feature(bag_of_lemmas)
  284. fixtures = dict(
  285. test_lemmas=(
  286. lambda: _e(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}"),
  287. EQ, set("drinking mate makes you go to the toilet".split())),
  288. test_none=(
  289. lambda: _e(u""),
  290. EQ, set()),
  291. )
  292. class TestSyntacticTreeBagOfTags(ManagerTestCase, FeatureEvidenceBaseCase):
  293. def bag_of_tree_tags(datapoint):
  294. tags = set()
  295. to_explore = datapoint.segment.syntactic_sentences
  296. while to_explore:
  297. tree = to_explore.pop(0)
  298. if isinstance(tree, str): # leaf
  299. continue
  300. tags.add(tree.label())
  301. to_explore.extend(list(tree))
  302. return tags
  303. feature = make_feature(bag_of_tree_tags)
  304. fixtures = dict(
  305. test_empty=(
  306. lambda: _e(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}"),
  307. EQ, set()),
  308. test_one=(
  309. lambda: _e(
  310. u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  311. syntactic_sentence="""
  312. (ROOT
  313. (S
  314. (NP (NNP Drinking) (NNP Mate))
  315. (VP (VBZ makes)
  316. (S
  317. (NP (PRP you))
  318. (VP (VB go)
  319. (PP (TO to)
  320. (NP (DT the) (NN toilet))))))))
  321. """),
  322. EQ, set("ROOT S NP NNP VP VBZ PRP VB PP TO DT NN".split())),
  323. )
  324. class TestSyntacticTreeHeight(ManagerTestCase, FeatureEvidenceBaseCase):
  325. def tree_height(datapoint):
  326. heights = [x.height() for x in datapoint.segment.syntactic_sentences]
  327. return sum(heights)
  328. feature = make_feature(tree_height)
  329. fixtures = dict(
  330. test_empty=(
  331. lambda: _e(u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}"),
  332. EQ, 0),
  333. test_one=(
  334. lambda: _e(
  335. u"Drinking {Mate|thing*} makes you go to the {toilet|thing**}",
  336. syntactic_sentence="""
  337. (ROOT
  338. (S
  339. (NP (NNP Drinking) (NNP Mate))
  340. (VP (VBZ makes)
  341. (S
  342. (NP (PRP you))
  343. (VP (VB go)
  344. (PP (TO to)
  345. (NP (DT the) (NN toilet))))))))
  346. """),
  347. EQ, 9),
  348. )
  349. class MockedModule:
  350. def __init__(self):
  351. relation = RelationFactory(name="mockrelation")
  352. relation.save()
  353. self.RELATION = relation.name
  354. def custom_feature(*args, **kwargs):
  355. return "custom feature"
  356. @rule(True)
  357. def custom_rule_feature(*args, **kwargs):
  358. # always match
  359. return refo.Star(refo.Any())
  360. @rule(True)
  361. def custom_only_match_dot(*args, **kwargs):
  362. # only match dot
  363. return Token('.')
  364. @rule(False)
  365. def custom_negative_rule_feature(*args, **kwargs):
  366. # always matchs
  367. return refo.Star(refo.Any())
  368. @rule(False)
  369. def custom_negative_only_match_dot(*args, **kwargs):
  370. # only match dot
  371. return Token('.')
  372. class TestCustomFeatures(ManagerTestCase):
  373. def test_parse_custom_feature(self):
  374. with mock.patch("importlib.import_module") as mock_import:
  375. mocked_module = MockedModule()
  376. mock_import.return_value = mocked_module
  377. fs = parse_features(["app.module.custom_feature"])
  378. mock_import.assert_called_with("app.module")
  379. self.assertEqual(len(fs), 1)
  380. self.assertEqual(fs[0](), "custom feature")
  381. def test_parse_custom_rule_feature(self):
  382. with mock.patch("importlib.import_module") as mock_import:
  383. with mock.patch.object(features, "rule_wrapper") as mock_rule_wrapper:
  384. mocked_module = MockedModule()
  385. mock_import.return_value = mocked_module
  386. fs = parse_features(["app.rules.custom_rule_feature"])
  387. mock_import.assert_called_with("app.rules")
  388. self.assertEqual(len(fs), 1)
  389. self.assertTrue(mock_rule_wrapper.called)
  390. def test_invalid_custom_feature(self):
  391. with self.assertRaises(KeyError):
  392. parse_features(["does.not.exists"])
  393. def test_rule_wrapper_returns_int(self):
  394. with mock.patch("importlib.import_module") as mock_import:
  395. mocked_module = MockedModule()
  396. mock_import.return_value = mocked_module
  397. evidence = _e("test")
  398. fs = parse_features(["app.rules.custom_rule_feature"])
  399. self.assertEqual(fs[0](evidence), 1)
  400. fs = parse_features(["app.rules.custom_only_match_dot"])
  401. self.assertEqual(fs[0](evidence), 0)
  402. fs = parse_features(["app.rules.custom_negative_rule_feature"])
  403. self.assertEqual(fs[0](evidence), 1)
  404. fs = parse_features(["app.rules.custom_negative_only_match_dot"])
  405. self.assertEqual(fs[0](evidence), 0)