#!/usr/bin/env python
# -*- Mode: Python; tab-width: 4; indent-tabs-mode: nil; coding: utf-8; -*-
# vim:set ft=python ts=4 sw=4 sts=4 autoindent:

# Verification of BioNLP Shared Task - style annotations.


import annotation
from projectconfig import ProjectConfiguration

# Issue types, passed as the `type` of an AnnotationIssue by the verify_*
# functions in this module. Values should match with annotation interface,
# so the strings must not be changed independently of it.
AnnotationError = "AnnotationError"  # disallowed types, spans, relations, arguments, attributes
AnnotationWarning = "AnnotationWarning"  # used for duplicated triggers
AnnotationIncomplete = "AnnotationIncomplete"  # missing arguments, unreferenced triggers


class AnnotationIssue:
    """A single problem found while verifying annotations.

    Every instance receives a sequential identifier of the form "#N",
    drawn from a counter that is shared by the whole class.
    """

    # Number used in the "#N" identifier of the next issue to be created.
    _next_id_idx = 1

    def __init__(self, ann_id, type, description=""):
        """Create an issue of the given type attached to annotation ann_id.

        A description of None is stored as the empty string.
        """
        issue_cls = AnnotationIssue
        self.id = "#%d" % issue_cls._next_id_idx
        issue_cls._next_id_idx += 1

        self.ann_id = ann_id
        self.type = type
        self.description = "" if description is None else description

    def human_readable_str(self):
        """Return the issue as "<ann_id>: <type><TAB><description>"."""
        fields = (self.ann_id, self.type, self.description)
        return "%s: %s\t%s" % fields

    def __str__(self):
        """Return the issue as "<id><TAB><type> <ann_id><TAB><description>"."""
        fields = (self.id, self.type, self.ann_id, self.description)
        return "%s\t%s %s\t%s" % fields


def event_nonum_args(e):
    """Group the argument targets of an EventAnnotation by role name.

    Trailing digits are dropped from each role (e.g. "Theme1" -> "Theme").
    The result maps every stripped role to the list of argument ids filed
    under it, in the order in which they occur in e.args.
    """
    import re

    strip_number = re.compile(r'^(.*?)\d*$').match

    grouped = {}
    for role, target_id in e.args:
        found = strip_number(role)
        if found:
            role = found.group(1)
        grouped.setdefault(role, []).append(target_id)
    return grouped


def event_nonum_arg_count(e):
    """Count the arguments of an EventAnnotation per role name.

    Trailing digits are dropped from each role (e.g. "Theme1" -> "Theme")
    before counting. The result maps every stripped role to the number of
    entries in e.args that carry it.
    """
    import re

    strip_number = re.compile(r'^(.*?)\d*$').match

    tally = {}
    for role, _target_id in e.args:
        found = strip_number(role)
        if found:
            role = found.group(1)
        if role in tally:
            tally[role] += 1
        else:
            tally[role] = 1
    return tally


def check_textbound_overlap(anns):
    """Find overlapping pairs among the given TextBoundAnnotations.

    Two annotations overlap when each one starts before the other ends,
    judged by first_start() and last_end(). Every ordered pair is tested,
    so an overlapping couple is reported twice: as (a, b) and as (b, a).
    An annotation is never paired with itself.

    Returns a list of pairs of overlapping annotations.
    """
    return [
        (first, second)
        for first in anns
        for second in anns
        if first is not second
        and second.first_start() < first.last_end()
        and second.last_end() > first.first_start()
    ]


def verify_equivs(ann_obj, projectconf):
    """Check that Equiv groups only join types that may be equivalent.

    For every pair of entity types inside an Equiv group for which the
    project configuration offers no equivalence relation type, one
    AnnotationError is reported for each member of the group. A pair that
    was already reported is not reported again in mirrored order.

    Returns a list of AnnotationIssues.
    """
    found_issues = []

    for equiv in ann_obj.get_equivs():
        # the annotations that are declared equivalent
        members = [ann_obj.get_ann_by_id(member_id)
                   for member_id in equiv.entities]

        # Work on the distinct types (in first-seen order) rather than on
        # the members themselves, to avoid an N^2 search where N=entities.
        distinct_types = list(dict.fromkeys(m.type for m in members))

        # To avoid marking both (a1,a2) and (a2,a1), remember what has
        # already been reported.
        reported = set()

        for src_type in distinct_types:
            for dst_type in distinct_types:
                # TODO: this is too convoluted; use projectconf directly
                candidates = projectconf.relation_types_from_to(
                    src_type, dst_type)
                if any(projectconf.is_equiv_type(c) for c in candidates):
                    continue
                # Avoid redundant output
                if (dst_type, src_type) in reported:
                    continue
                # TODO: mark this error on the Eq relation, not the entities
                found_issues.extend(
                    AnnotationIssue(
                        member.id,
                        AnnotationError,
                        "Equivalence relation %s not allowed between %s and %s" % (
                            equiv.type,
                            projectconf.preferred_display_form(src_type),
                            projectconf.preferred_display_form(dst_type)))
                    for member in members)
                reported.add((src_type, dst_type))

    return found_issues


def verify_entity_overlap(ann_obj, projectconf):
    """Check overlapping physical entities against the span configuration.

    Each overlapping ordered pair of physical-entity textbounds is
    classified as identical span, contained, containing or crossing, and
    an AnnotationError is attached to the first annotation of the pair when
    the project configuration does not allow that arrangement for the two
    types.

    Returns a list of AnnotationIssues.
    """
    # shortcut
    def label(type_name):
        return projectconf.preferred_display_form(type_name)

    # only physical entities are checked for overlap
    physical = []
    for textbound in ann_obj.get_textbounds():
        if projectconf.is_physical_entity_type(textbound.type):
            physical.append(textbound)

    problems = []
    for this, other in check_textbound_overlap(physical):
        if this.same_span(other):
            if projectconf.spans_can_be_equal(this.type, other.type):
                continue
            text = "Error: %s cannot have identical span with %s %s" % (
                label(this.type), label(other.type), other.id)
        elif other.contains(this):
            if projectconf.span_can_contain(this.type, other.type):
                continue
            text = "Error: %s cannot be contained in %s (%s)" % (
                label(this.type), label(other.type), other.id)
        elif this.contains(other):
            if projectconf.span_can_contain(other.type, this.type):
                continue
            text = "Error: %s cannot contain %s (%s)" % (
                label(this.type), label(other.type), other.id)
        else:
            # overlapping, but neither equal nor nested: the spans cross
            if projectconf.spans_can_cross(this.type, other.type):
                continue
            text = ("Error: annotation cannot have crossing span with %s" %
                    other.id)
        problems.append(AnnotationIssue(this.id, AnnotationError, text))

    # TODO: generalize to other cases
    return problems


def verify_annotation_types(ann_obj, projectconf):
    """Check that every annotation has a type known to the configuration.

    Events must have a configured event type, textbounds a configured
    event or entity type, and relations a configured relation type. Each
    annotation that fails the test yields one AnnotationError.

    Returns a list of AnnotationIssues.
    """
    known_events = projectconf.get_event_types()
    known_textbounds = known_events + projectconf.get_entity_types()
    known_relations = projectconf.get_relation_types()

    # (annotation getter, accepted types, message template), in report order
    checks = (
        (ann_obj.get_events, known_events,
         "Error: %s is not a known event type (check configuration?)"),
        (ann_obj.get_textbounds, known_textbounds,
         "Error: %s is not a known textbound type (check configuration?)"),
        (ann_obj.get_relations, known_relations,
         "Error: %s is not a known relation type (check configuration?)"),
    )

    problems = []
    for fetch, accepted, template in checks:
        for ann in fetch():
            if ann.type in accepted:
                continue
            problems.append(
                AnnotationIssue(
                    ann.id,
                    AnnotationError,
                    template % projectconf.preferred_display_form(ann.type)))

    return problems


def verify_triggers(ann_obj, projectconf):
    """Check event triggers for unreferenced and duplicated instances.

    A trigger is a textbound whose type is an event type according to
    projectconf.is_event_type(). Two kinds of issue are reported:

    * AnnotationIncomplete for each trigger that no event refers to;
    * AnnotationWarning for each trigger that shares both its set of
      spans and its type with at least one other trigger.

    Returns a list of AnnotationIssues.
    """
    issues = []

    # trigger id -> events using that trigger
    events_by_trigger = {}
    for e in ann_obj.get_events():
        events_by_trigger.setdefault(e.trigger, []).append(e)

    # (set of spans, type) -> triggers with exactly those spans and type
    trigger_by_span_and_type = {}

    for t in ann_obj.get_textbounds():
        if not projectconf.is_event_type(t.type):
            continue

        if t.id not in events_by_trigger:
            issues.append(
                AnnotationIssue(
                    t.id,
                    AnnotationIncomplete,
                    "Warning: trigger %s is not referenced from any event" %
                    t.id))

        # A frozenset makes the key independent of the order in which the
        # spans are listed. Converting a set to a tuple does not: two equal
        # sets may iterate in different orders, which would hide duplicates.
        spt = (frozenset(t.spans), t.type)
        trigger_by_span_and_type.setdefault(spt, []).append(t)

    for trigs in trigger_by_span_and_type.values():
        if len(trigs) < 2:
            continue
        description = (
            "Warning: triggers %s have identical span and type (harmless but unnecessary duplication)" %
            ",".join(x.id for x in trigs))
        for t in trigs:
            # We currently need to attach these to events if there are
            # any; issues attached to triggers referenced from events
            # don't get shown. TODO: revise once this is fixed.
            if t.id in events_by_trigger:
                target_id = events_by_trigger[t.id][0].id
            else:
                target_id = t.id
            issues.append(
                AnnotationIssue(target_id, AnnotationWarning, description))

    return issues


def _relation_labels_match(rel, rel_conf, inverted=False):
    """Tell whether the argument labels of rel fit the configured relation.

    rel_conf.arg_list must hold exactly two labels; otherwise the relation
    is likely misconfigured and nothing can match. With inverted=True the
    two labels of rel are compared in swapped order, i.e. as if its
    arguments had been exchanged. rel itself is never modified.
    """
    if len(rel_conf.arg_list) != 2:
        # likely misconfigured relation, can't match
        return False
    if inverted:
        first_label, second_label = rel.arg2l, rel.arg1l
    else:
        first_label, second_label = rel.arg1l, rel.arg2l
    return (first_label == rel_conf.arg_list[0] and
            second_label == rel_conf.arg_list[1])


def _relation_allowed(projectconf, rel, from_type, to_type, inverted=False):
    """Tell whether rel is configured from from_type to to_type.

    The relation type must be allowed between the two types, and at least
    one configured relation of that type must carry matching labels.
    """
    if rel.type not in projectconf.relation_types_from_to(from_type, to_type):
        return False
    conf_rels = projectconf.get_relations_by_type(rel.type)
    # Test the outcome of the label comparison itself, not the truth value
    # of the configuration object that happened to match.
    return any(_relation_labels_match(rel, c, inverted) for c in conf_rels)


def verify_relations(ann_obj, projectconf):
    """Check that every relation is allowed between its argument types.

    A relation is accepted when the project configuration allows its type
    and argument labels for the argument order as annotated, or for the
    reverse order. Any other relation yields an AnnotationError. The
    relation annotations are only read, never modified, so they stay
    intact even if a configuration lookup raises.

    Returns a list of AnnotationIssues.
    """
    issues = []

    # shortcut
    def disp(s):
        return projectconf.preferred_display_form(s)

    # TODO: rethink this function.
    for r in ann_obj.get_relations():
        a1 = ann_obj.get_ann_by_id(r.arg1)
        a2 = ann_obj.get_ann_by_id(r.arg2)

        # check for argument order a1, a2
        if _relation_allowed(projectconf, r, a1.type, a2.type):
            continue

        # no match for argument order a1, a2; try a2, a1
        if _relation_allowed(projectconf, r, a2.type, a1.type, inverted=True):
            continue

        # not found for either argument order
        issues.append(
            AnnotationIssue(
                r.id, AnnotationError, "Error: %s relation %s:%s %s:%s not allowed" %
                (disp(r.type), r.arg1l, disp(a1.type), r.arg2l, disp(a2.type))))

    return issues


def verify_missing_arguments(ann_obj, projectconf):
    """Report events that lack some of their mandatory arguments.

    For every mandatory argument of an event's type, the number of such
    arguments on the event (ignoring trailing numbers in argument names) is
    compared with the configured minimum. Too few arguments yield an
    AnnotationIncomplete issue stating how many are required.

    Returns a list of AnnotationIssues.
    """
    problems = []

    for event in ann_obj.get_events():
        counts = event_nonum_arg_count(event)
        for role in projectconf.mandatory_arguments(event.type):
            present = counts.get(role, 0)
            required = projectconf.argument_minimum_count(event.type, role)
            limit = projectconf.argument_maximum_count(event.type, role)
            if present >= required:
                continue

            # insufficient: word the requirement, then add the issue
            role_label = projectconf.preferred_display_form(role)
            if required == 1:
                needed = "one %s argument " % role_label
            else:
                needed = "%d %s arguments " % (required, role_label)
            qualifier = "exactly " if required == limit else "at least "

            problems.append(
                AnnotationIssue(
                    event.id,
                    AnnotationIncomplete,
                    "Incomplete: " + qualifier + needed + "required for event"))

    return problems


def verify_disallowed_arguments(ann_obj, projectconf):
    """Report event arguments that the configuration does not permit.

    An AnnotationError is raised against an event for each argument role
    (ignoring trailing numbers) that its type cannot take at all, and for
    each argument whose role is permitted but not for the type of the
    annotation it points to.

    Returns a list of AnnotationIssues.
    """
    # shortcut
    def label(name):
        return projectconf.preferred_display_form(name)

    problems = []

    for event in ann_obj.get_events():
        permitted = projectconf.arc_types_from(event.type)
        targets_by_role = event_nonum_args(event)

        for role, target_ids in targets_by_role.items():
            if role not in permitted:
                problems.append(
                    AnnotationIssue(
                        event.id,
                        AnnotationError,
                        "Error: %s cannot take a %s argument" % (
                            label(event.type), label(role))))
                continue

            # role is fine in general; check it against each target's type
            for target_id in target_ids:
                target = ann_obj.get_ann_by_id(target_id)
                if role in projectconf.arc_types_from_to(event.type,
                                                         target.type):
                    continue
                problems.append(
                    AnnotationIssue(
                        event.id,
                        AnnotationError,
                        "Error: %s argument %s cannot be of type %s" % (
                            label(event.type), label(role),
                            label(target.type))))

    return problems


def verify_extra_arguments(ann_obj, projectconf):
    """Report events carrying too many arguments of one kind.

    Only argument roles (ignoring trailing numbers) occurring more than
    once on an event are examined. An AnnotationError is reported when the
    event type may not take the role multiple times, or when the number of
    occurrences exceeds the configured maximum.

    Returns a list of AnnotationIssues.
    """
    # shortcut
    def label(name):
        return projectconf.preferred_display_form(name)

    problems = []

    for event in ann_obj.get_events():
        counts = event_nonum_arg_count(event)
        repeatable = projectconf.multiple_allowed_arguments(event.type)

        for role, seen in counts.items():
            if seen <= 1:
                continue
            limit = projectconf.argument_maximum_count(event.type, role)
            if role not in repeatable:
                text = "Error: %s cannot take multiple %s arguments" % (
                    label(event.type), label(role))
            elif seen > limit:
                text = "Error: %s can take at most %d %s arguments" % (
                    label(event.type), limit, label(role))
            else:
                continue
            problems.append(AnnotationIssue(event.id, AnnotationError, text))

    return problems


def verify_attributes(ann_obj, projectconf):
    """Report attributes attached to annotations that may not carry them.

    Each attribute is looked up against the attributes configured for the
    type of its target annotation. A type that is not among them yields an
    AnnotationError on the target.

    Returns a list of AnnotationIssues.
    """
    problems = []

    for attribute in ann_obj.get_attributes():
        target = ann_obj.get_ann_by_id(attribute.target)
        if attribute.type in projectconf.attributes_for(target.type):
            continue
        problems.append(
            AnnotationIssue(
                target.id,
                AnnotationError,
                "Error: %s cannot take a %s attribute" % (
                    projectconf.preferred_display_form(target.type),
                    projectconf.preferred_display_form(attribute.type))))

    return problems


def verify_annotation(ann_obj, projectconf):
    """Run every verification check on the given AnnotationFile.

    The checks are applied one after another and their results are
    concatenated in that order.

    Returns a list of AnnotationIssues.
    """
    # The order here is the order in which issues are reported.
    checks = (
        verify_annotation_types,
        verify_equivs,
        verify_entity_overlap,
        verify_triggers,
        verify_relations,
        verify_missing_arguments,
        verify_disallowed_arguments,
        verify_extra_arguments,
        verify_attributes,
    )

    collected = []
    for check in checks:
        collected += check(ann_obj, projectconf)
    return collected


def argparser():
    """Build the command-line parser of the verification script.

    The parser accepts an optional -v/--verbose flag and one or more
    positional FILE arguments, stored as "verbose" and "files".
    """
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Verify BioNLP Shared Task annotations.")
    parser.add_argument("-v", "--verbose", action="store_true",
                        default=False, help="Verbose output.")
    parser.add_argument("files", metavar="FILE", nargs="+",
                        help="Files to verify.")
    return parser


def main(argv=None):
    """Verify the annotation files named on the command line.

    argv is a full argument vector whose first element is skipped; it
    defaults to sys.argv. For each file a ProjectConfiguration is created
    from the file's directory, the annotations are verified, and every
    issue is printed to standard output prefixed with the file name.
    Missing files and missing annotations are reported on standard error
    and do not stop the processing of the remaining files.

    Returns None.
    """
    import sys
    import os

    if argv is None:
        argv = sys.argv
    arg = argparser().parse_args(argv[1:])

    # Extensions dropped from a file name before it is opened.
    stripped_suffixes = (".a2", ".rel", ".ann")

    for fn in arg.files:
        try:
            projectconf = ProjectConfiguration(os.path.dirname(fn))
            # remove ".a2" or ".rel" suffixes for Annotations to prompt
            # parsing of .a1 also.
            # (TODO: temporarily removing .ann also to work around a
            # bug in TextAnnotations, but this should not be necessary.)
            # Only a trailing extension is removed: the same text occurring
            # elsewhere in the path (e.g. in a directory name such as
            # "corpus.ann/") must be left alone or the path breaks.
            root, ext = os.path.splitext(fn)
            nosuff_fn = root if ext in stripped_suffixes else fn
            with annotation.TextAnnotations(nosuff_fn) as ann_obj:
                issues = verify_annotation(ann_obj, projectconf)
                for i in issues:
                    print("%s:\t%s" % (fn, i.human_readable_str()))
        except annotation.AnnotationFileNotFoundError:
            print("%s:\tFailed check: file not found" % fn, file=sys.stderr)
        except annotation.AnnotationNotFoundError as e:
            print("%s:\tFailed check: %s" % (fn, e), file=sys.stderr)

    if arg.verbose:
        print("Check complete.", file=sys.stderr)


# Script entry point: run main() with the process arguments and hand its
# return value to sys.exit() as the exit status.
if __name__ == "__main__":
    import sys
    sys.exit(main())