verify_annotations.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556
  1. #!/usr/bin/env python
  2. # -*- Mode: Python; tab-width: 4; indent-tabs-mode: nil; coding: utf-8; -*-
  3. # vim:set ft=python ts=4 sw=4 sts=4 autoindent:
  4. # Verification of BioNLP Shared Task - style annotations.
  5. import annotation
  6. from projectconfig import ProjectConfiguration
# Issue types. These string values should match the ones used by the
# annotation interface.
AnnotationError = "AnnotationError"
AnnotationWarning = "AnnotationWarning"
AnnotationIncomplete = "AnnotationIncomplete"
  11. class AnnotationIssue:
  12. """Represents an issue noted in verification of annotations."""
  13. _next_id_idx = 1
  14. def __init__(self, ann_id, type, description=""):
  15. self.id = "#%d" % AnnotationIssue._next_id_idx
  16. AnnotationIssue._next_id_idx += 1
  17. self.ann_id, self.type, self.description = ann_id, type, description
  18. if self.description is None:
  19. self.description = ""
  20. def human_readable_str(self):
  21. return "%s: %s\t%s" % (self.ann_id, self.type, self.description)
  22. def __str__(self):
  23. return "%s\t%s %s\t%s" % (
  24. self.id, self.type, self.ann_id, self.description)
  25. def event_nonum_args(e):
  26. """Given an EventAnnotatation, returns its arguments without trailing
  27. numbers (e.g. "Theme1" -> "Theme")."""
  28. from re import match as re_match
  29. nna = {}
  30. for arg, aid in e.args:
  31. m = re_match(r'^(.*?)\d*$', arg)
  32. if m:
  33. arg = m.group(1)
  34. if arg not in nna:
  35. nna[arg] = []
  36. nna[arg].append(aid)
  37. return nna
  38. def event_nonum_arg_count(e):
  39. """Given an EventAnnotation, returns a dictionary containing for each of
  40. its argument without trailing numbers (e.g. "Theme1" -> "Theme") the number
  41. of times the argument appears."""
  42. from re import match as re_match
  43. nnc = {}
  44. for arg, aid in e.args:
  45. m = re_match(r'^(.*?)\d*$', arg)
  46. if m:
  47. arg = m.group(1)
  48. nnc[arg] = nnc.get(arg, 0) + 1
  49. return nnc
  50. def check_textbound_overlap(anns):
  51. """Checks for overlap between the given TextBoundAnnotations.
  52. Returns a list of pairs of overlapping annotations.
  53. """
  54. overlapping = []
  55. for a1 in anns:
  56. for a2 in anns:
  57. if a1 is a2:
  58. continue
  59. if (a2.first_start() < a1.last_end() and
  60. a2.last_end() > a1.first_start()):
  61. overlapping.append((a1, a2))
  62. return overlapping
  63. def verify_equivs(ann_obj, projectconf):
  64. issues = []
  65. # shortcut
  66. def disp(s):
  67. return projectconf.preferred_display_form(s)
  68. for eq in ann_obj.get_equivs():
  69. # get the equivalent annotations
  70. equiv_anns = [ann_obj.get_ann_by_id(eid) for eid in eq.entities]
  71. # all pairs of entity types in the Equiv group must be allowed
  72. # to have an Equiv. Create type-level pairs to avoid N^2
  73. # search where N=entities.
  74. eq_type = {}
  75. for e in equiv_anns:
  76. eq_type[e.type] = True
  77. type_pairs = []
  78. for t1 in eq_type:
  79. for t2 in eq_type:
  80. type_pairs.append((t1, t2))
  81. # do avoid marking both (a1,a2) and (a2,a1), remember what's
  82. # already included
  83. marked = {}
  84. for t1, t2 in type_pairs:
  85. reltypes = projectconf.relation_types_from_to(t1, t2)
  86. # TODO: this is too convoluted; use projectconf directly
  87. equiv_type_found = False
  88. for rt in reltypes:
  89. if projectconf.is_equiv_type(rt):
  90. equiv_type_found = True
  91. if not equiv_type_found:
  92. # Avoid redundant output
  93. if (t2, t1) in marked:
  94. continue
  95. # TODO: mark this error on the Eq relation, not the entities
  96. for e in equiv_anns:
  97. issues.append(
  98. AnnotationIssue(
  99. e.id, AnnotationError, "Equivalence relation %s not allowed between %s and %s" %
  100. (eq.type, disp(t1), disp(t2))))
  101. marked[(t1, t2)] = True
  102. return issues
  103. def verify_entity_overlap(ann_obj, projectconf):
  104. issues = []
  105. # shortcut
  106. def disp(s):
  107. return projectconf.preferred_display_form(s)
  108. # check for overlap between physical entities
  109. physical_entities = [a for a in ann_obj.get_textbounds(
  110. ) if projectconf.is_physical_entity_type(a.type)]
  111. overlapping = check_textbound_overlap(physical_entities)
  112. for a1, a2 in overlapping:
  113. if a1.same_span(a2):
  114. if not projectconf.spans_can_be_equal(a1.type, a2.type):
  115. issues.append(
  116. AnnotationIssue(
  117. a1.id, AnnotationError, "Error: %s cannot have identical span with %s %s" %
  118. (disp(
  119. a1.type), disp(
  120. a2.type), a2.id)))
  121. elif a2.contains(a1):
  122. if not projectconf.span_can_contain(a1.type, a2.type):
  123. issues.append(
  124. AnnotationIssue(
  125. a1.id, AnnotationError, "Error: %s cannot be contained in %s (%s)" %
  126. (disp(
  127. a1.type), disp(
  128. a2.type), a2.id)))
  129. elif a1.contains(a2):
  130. if not projectconf.span_can_contain(a2.type, a1.type):
  131. issues.append(
  132. AnnotationIssue(
  133. a1.id, AnnotationError, "Error: %s cannot contain %s (%s)" %
  134. (disp(
  135. a1.type), disp(
  136. a2.type), a2.id)))
  137. else:
  138. if not projectconf.spans_can_cross(a1.type, a2.type):
  139. issues.append(
  140. AnnotationIssue(
  141. a1.id,
  142. AnnotationError,
  143. "Error: annotation cannot have crossing span with %s" %
  144. a2.id))
  145. # TODO: generalize to other cases
  146. return issues
  147. def verify_annotation_types(ann_obj, projectconf):
  148. issues = []
  149. event_types = projectconf.get_event_types()
  150. textbound_types = event_types + projectconf.get_entity_types()
  151. relation_types = projectconf.get_relation_types()
  152. # shortcut
  153. def disp(s):
  154. return projectconf.preferred_display_form(s)
  155. for e in ann_obj.get_events():
  156. if e.type not in event_types:
  157. issues.append(
  158. AnnotationIssue(
  159. e.id,
  160. AnnotationError,
  161. "Error: %s is not a known event type (check configuration?)" %
  162. disp(
  163. e.type)))
  164. for t in ann_obj.get_textbounds():
  165. if t.type not in textbound_types:
  166. issues.append(
  167. AnnotationIssue(
  168. t.id,
  169. AnnotationError,
  170. "Error: %s is not a known textbound type (check configuration?)" %
  171. disp(
  172. t.type)))
  173. for r in ann_obj.get_relations():
  174. if r.type not in relation_types:
  175. issues.append(
  176. AnnotationIssue(
  177. r.id,
  178. AnnotationError,
  179. "Error: %s is not a known relation type (check configuration?)" %
  180. disp(
  181. r.type)))
  182. return issues
  183. def verify_triggers(ann_obj, projectconf):
  184. issues = []
  185. events_by_trigger = {}
  186. for e in ann_obj.get_events():
  187. if e.trigger not in events_by_trigger:
  188. events_by_trigger[e.trigger] = []
  189. events_by_trigger[e.trigger].append(e)
  190. trigger_by_span_and_type = {}
  191. for t in ann_obj.get_textbounds():
  192. if not projectconf.is_event_type(t.type):
  193. continue
  194. if t.id not in events_by_trigger:
  195. issues.append(
  196. AnnotationIssue(
  197. t.id,
  198. AnnotationIncomplete,
  199. "Warning: trigger %s is not referenced from any event" %
  200. t.id))
  201. spt = tuple(set(t.spans)) + (t.type,)
  202. if spt not in trigger_by_span_and_type:
  203. trigger_by_span_and_type[spt] = []
  204. trigger_by_span_and_type[spt].append(t)
  205. for spt in trigger_by_span_and_type:
  206. trigs = trigger_by_span_and_type[spt]
  207. if len(trigs) < 2:
  208. continue
  209. for t in trigs:
  210. # We currently need to attach these to events if there are
  211. # any; issues attached to triggers referenced from events
  212. # don't get shown. TODO: revise once this is fixed.
  213. if t.id in events_by_trigger:
  214. issues.append(
  215. AnnotationIssue(
  216. events_by_trigger[
  217. t.id][0].id,
  218. AnnotationWarning,
  219. "Warning: triggers %s have identical span and type (harmless but unnecessary duplication)" %
  220. ",".join(
  221. [
  222. x.id for x in trigs])))
  223. else:
  224. issues.append(
  225. AnnotationIssue(
  226. t.id,
  227. AnnotationWarning,
  228. "Warning: triggers %s have identical span and type (harmless but unnecessary duplication)" %
  229. ",".join(
  230. [
  231. x.id for x in trigs])))
  232. return issues
  233. def _relation_labels_match(rel, rel_conf):
  234. if len(rel_conf.arg_list) != 2:
  235. # likely misconfigured relation, can't match
  236. return False
  237. return (rel.arg1l == rel_conf.arg_list[0] and
  238. rel.arg2l == rel_conf.arg_list[1])
  239. def verify_relations(ann_obj, projectconf):
  240. issues = []
  241. # shortcut
  242. def disp(s):
  243. return projectconf.preferred_display_form(s)
  244. # TODO: rethink this function.
  245. for r in ann_obj.get_relations():
  246. a1 = ann_obj.get_ann_by_id(r.arg1)
  247. a2 = ann_obj.get_ann_by_id(r.arg2)
  248. match_found = False
  249. # check for argument order a1, a2
  250. if r.type in projectconf.relation_types_from_to(a1.type, a2.type):
  251. # found for argument order a1, a2; check labels
  252. conf_rels = projectconf.get_relations_by_type(r.type)
  253. if any(c for c in conf_rels if _relation_labels_match(r, c)):
  254. match_found = True
  255. if match_found:
  256. continue
  257. # no match for argument order a1, a2; try a2, a1
  258. # temp inversion for check
  259. r.arg1, r.arg2, r.arg1l, r.arg2l = r.arg2, r.arg1, r.arg2l, r.arg1l
  260. if r.type in projectconf.relation_types_from_to(a2.type, a1.type):
  261. conf_rels = projectconf.get_relations_by_type(r.type)
  262. if any(c for c in conf_rels if _relation_labels_match(r, c)):
  263. match_found = True
  264. r.arg1, r.arg2, r.arg1l, r.arg2l = r.arg2, r.arg1, r.arg2l, r.arg1l
  265. if match_found:
  266. continue
  267. # not found for either argument order
  268. issues.append(
  269. AnnotationIssue(
  270. r.id, AnnotationError, "Error: %s relation %s:%s %s:%s not allowed" %
  271. (disp(
  272. r.type), r.arg1l, disp(
  273. a1.type), r.arg2l, disp(
  274. a2.type))))
  275. return issues
  276. def verify_missing_arguments(ann_obj, projectconf):
  277. """Checks for events having too few mandatory arguments."""
  278. issues = []
  279. # shortcut
  280. def disp(s):
  281. return projectconf.preferred_display_form(s)
  282. for e in ann_obj.get_events():
  283. nonum_arg_counts = event_nonum_arg_count(e)
  284. for m in projectconf.mandatory_arguments(e.type):
  285. c = nonum_arg_counts.get(m, 0)
  286. amin = projectconf.argument_minimum_count(e.type, m)
  287. amax = projectconf.argument_maximum_count(e.type, m)
  288. if c < amin:
  289. # insufficient, pick appropriate string and add issue
  290. if amin == 1:
  291. countstr = "one %s argument " % disp(m)
  292. else:
  293. countstr = "%d %s arguments " % (amin, disp(m))
  294. if amin == amax:
  295. countstr = "exactly " + countstr
  296. else:
  297. countstr = "at least " + countstr
  298. issues.append(
  299. AnnotationIssue(
  300. e.id,
  301. AnnotationIncomplete,
  302. "Incomplete: " +
  303. countstr +
  304. "required for event"))
  305. return issues
  306. def verify_disallowed_arguments(ann_obj, projectconf):
  307. """Checks for events with arguments they are not allowed to have."""
  308. issues = []
  309. # shortcut
  310. def disp(s):
  311. return projectconf.preferred_display_form(s)
  312. for e in ann_obj.get_events():
  313. allowed = projectconf.arc_types_from(e.type)
  314. eargs = event_nonum_args(e)
  315. for a in eargs:
  316. if a not in allowed:
  317. issues.append(
  318. AnnotationIssue(
  319. e.id,
  320. AnnotationError,
  321. "Error: %s cannot take a %s argument" %
  322. (disp(
  323. e.type),
  324. disp(a))))
  325. else:
  326. for rid in eargs[a]:
  327. r = ann_obj.get_ann_by_id(rid)
  328. if a not in projectconf.arc_types_from_to(e.type, r.type):
  329. issues.append(
  330. AnnotationIssue(
  331. e.id, AnnotationError, "Error: %s argument %s cannot be of type %s" %
  332. (disp(
  333. e.type), disp(a), disp(
  334. r.type))))
  335. return issues
  336. def verify_extra_arguments(ann_obj, projectconf):
  337. """Checks for events with excessively many allowed arguments."""
  338. issues = []
  339. # shortcut
  340. def disp(s):
  341. return projectconf.preferred_display_form(s)
  342. for e in ann_obj.get_events():
  343. nonum_arg_counts = event_nonum_arg_count(e)
  344. multiple_allowed = projectconf.multiple_allowed_arguments(e.type)
  345. for a in [m for m in nonum_arg_counts if nonum_arg_counts[m] > 1]:
  346. amax = projectconf.argument_maximum_count(e.type, a)
  347. if a not in multiple_allowed:
  348. issues.append(
  349. AnnotationIssue(
  350. e.id,
  351. AnnotationError,
  352. "Error: %s cannot take multiple %s arguments" %
  353. (disp(
  354. e.type),
  355. disp(a))))
  356. elif nonum_arg_counts[a] > amax:
  357. issues.append(
  358. AnnotationIssue(
  359. e.id,
  360. AnnotationError,
  361. "Error: %s can take at most %d %s arguments" %
  362. (disp(
  363. e.type),
  364. amax,
  365. disp(a))))
  366. return issues
  367. def verify_attributes(ann_obj, projectconf):
  368. """Checks for instances of attributes attached to annotations that are not
  369. allowed to have them."""
  370. issues = []
  371. # shortcut
  372. def disp(s):
  373. return projectconf.preferred_display_form(s)
  374. for a in ann_obj.get_attributes():
  375. tid = a.target
  376. t = ann_obj.get_ann_by_id(tid)
  377. allowed = projectconf.attributes_for(t.type)
  378. if a.type not in allowed:
  379. issues.append(
  380. AnnotationIssue(
  381. t.id, AnnotationError, "Error: %s cannot take a %s attribute" %
  382. (disp(
  383. t.type), disp(
  384. a.type))))
  385. return issues
  386. def verify_annotation(ann_obj, projectconf):
  387. """Verifies the correctness of a given AnnotationFile.
  388. Returns a list of AnnotationIssues.
  389. """
  390. issues = []
  391. issues += verify_annotation_types(ann_obj, projectconf)
  392. issues += verify_equivs(ann_obj, projectconf)
  393. issues += verify_entity_overlap(ann_obj, projectconf)
  394. issues += verify_triggers(ann_obj, projectconf)
  395. issues += verify_relations(ann_obj, projectconf)
  396. issues += verify_missing_arguments(ann_obj, projectconf)
  397. issues += verify_disallowed_arguments(ann_obj, projectconf)
  398. issues += verify_extra_arguments(ann_obj, projectconf)
  399. issues += verify_attributes(ann_obj, projectconf)
  400. return issues
  401. def argparser():
  402. import argparse
  403. ap = argparse.ArgumentParser(
  404. description="Verify BioNLP Shared Task annotations.")
  405. ap.add_argument(
  406. "-v",
  407. "--verbose",
  408. default=False,
  409. action="store_true",
  410. help="Verbose output.")
  411. ap.add_argument(
  412. "files",
  413. metavar="FILE",
  414. nargs="+",
  415. help="Files to verify.")
  416. return ap
  417. def main(argv=None):
  418. import sys
  419. import os
  420. if argv is None:
  421. argv = sys.argv
  422. arg = argparser().parse_args(argv[1:])
  423. for fn in arg.files:
  424. try:
  425. projectconf = ProjectConfiguration(os.path.dirname(fn))
  426. # remove ".a2" or ".rel" suffixes for Annotations to prompt
  427. # parsing of .a1 also.
  428. # (TODO: temporarily removing .ann also to work around a
  429. # bug in TextAnnotations, but this should not be necessary.)
  430. nosuff_fn = fn.replace(
  431. ".a2",
  432. "").replace(
  433. ".rel",
  434. "").replace(
  435. ".ann",
  436. "")
  437. with annotation.TextAnnotations(nosuff_fn) as ann_obj:
  438. issues = verify_annotation(ann_obj, projectconf)
  439. for i in issues:
  440. print("%s:\t%s" % (fn, i.human_readable_str()))
  441. except annotation.AnnotationFileNotFoundError:
  442. print("%s:\tFailed check: file not found" % fn, file=sys.stderr)
  443. except annotation.AnnotationNotFoundError as e:
  444. print("%s:\tFailed check: %s" % (fn, e), file=sys.stderr)
  445. if arg.verbose:
  446. print("Check complete.", file=sys.stderr)
if __name__ == "__main__":
    # Run as a script: use the return value of main() as the exit status.
    import sys
    sys.exit(main())