# luojiehua/iepy-develop
"""
Run IEPY active-learning extractor

Usage:
    iepy_runner.py [options] <relation_name> <output>
    iepy_runner.py [options] --db-store <relation_name>
    iepy_runner.py -h | --help | --version

Options:
  --store-extractor=<extractor_output>     Stores the trained classifier
  --trained-extractor=<extractor_path>     Load an already trained extractor
  --db-store                               Stores the predictions on the database
  --no-questions                           Won't generate questions to answer. Will predict
                                           as is. Should be used with --trained-extractor
  --tune-for=<tune-for>                    Predictions tuning. Options are high-prec
                                           or high-recall [default: high-prec]
  --extractor-config=<config.json>         Sets the extractor config
  --version                                Version number
  -h --help                                Show this screen
"""

import os
import json
import logging
from docopt import docopt
from sys import exit

import iepy
# iepy.setup() is invoked here, before the remaining iepy.* imports below.
# Its return value is kept because the default "extractor_config.json" is
# looked up under this path when --extractor-config is not given.
INSTANCE_PATH = iepy.setup(__file__)

from iepy.extraction.active_learning_core import ActiveLearningCore, HIPREC, HIREC
from iepy.data.db import CandidateEvidenceManager
from iepy.data.models import Relation
from iepy.extraction.terminal import TerminalAdministration
from iepy.data import output


def print_all_relations():
    """Print a header line followed by every stored Relation, indented by two spaces."""
    print("All available relations:")
    # Formatting is applied lazily, one relation at a time, as the query is iterated.
    for line in map("  {}".format, Relation.objects.all()):
        print(line)


def load_labeled_evidences(relation, evidences):
    """Return the labels known for `evidences` under `relation`.

    Delegates to CandidateEvidenceManager.labels_for, passing
    CandidateEvidenceManager.conflict_resolution_newest_wins as the
    conflict-resolution strategy.
    """
    return CandidateEvidenceManager.labels_for(
        relation,
        evidences,
        CandidateEvidenceManager.conflict_resolution_newest_wins,
    )


def _get_tuning_mode(opts):
    """Translate the --tune-for option into a tuning constant.

    Returns HIPREC for 'high-prec' and HIREC for 'high-recall'. Any other
    value prints an error plus the module usage text and exits with status 1.
    """
    requested = opts['--tune-for']
    if requested == 'high-prec':
        return HIPREC
    if requested == 'high-recall':
        return HIREC

    # Unknown value: report it, show the usage text and abort.
    print('Invalid tuning mode')
    print(__doc__)
    exit(1)


def _get_relation(opts):
    """Fetch the Relation named by the <relation_name> argument.

    If no relation with that name exists, prints an error followed by the
    list of available relations and exits with status 1.
    """
    wanted = opts['<relation_name>']
    try:
        return Relation.objects.get(name=wanted)
    except Relation.DoesNotExist:
        print("Relation {!r} non existent".format(wanted))
        print_all_relations()
        exit(1)


def _load_extractor(opts, relation, labeled_evidences):
    """Load the extractor stored at the --trained-extractor path.

    Exits with status 1 when ActiveLearningCore.load raises ValueError, or
    when the loaded extractor's relation differs from `relation`.
    Returns the loaded extractor otherwise.
    """
    stored_at = opts.get('--trained-extractor')
    try:
        loaded = ActiveLearningCore.load(
            stored_at, labeled_evidences=labeled_evidences
        )
    except ValueError:
        print("Error: unable to load extractor, invalid file")
        exit(1)

    if loaded.relation != relation:
        mismatch = ('The loaded extractor is not for the requested relation'
                    ' but for relation {} instead')
        print(mismatch.format(loaded.relation))
        exit(1)

    print('Extractor successfully loaded')
    return loaded


def _construct_extractor(opts, relation, labeled_evidences, tuning_mode):
    """Build a fresh ActiveLearningCore from a JSON extractor configuration.

    The configuration path comes from --extractor-config; when that option
    is empty, "extractor_config.json" under INSTANCE_PATH is used.

    Parameters:
        opts: parsed command-line options (mapping supporting ``.get``).
        relation: relation the extractor is built for.
        labeled_evidences: labels handed to the extractor.
        tuning_mode: value passed as the ``tradeoff`` keyword.

    Returns:
        The newly constructed ActiveLearningCore.

    Exits with status 1 (after printing an error) when the configuration
    file does not exist, cannot be opened, or cannot be parsed.
    """
    config_filepath = opts.get("--extractor-config")
    if not config_filepath:
        config_filepath = os.path.join(INSTANCE_PATH, "extractor_config.json")

    if not os.path.exists(config_filepath):
        print("Error: extractor config does not exists, please create the "
              "file extractor_config.json or use the --extractor-config")
        exit(1)

    # open() is inside the guarded region as well: a path that exists but
    # cannot be opened (a directory, missing read permission, ...) must be
    # reported like any other config error instead of raising a traceback.
    try:
        with open(config_filepath) as filehandler:
            extractor_config = json.load(filehandler)
    except Exception as error:
        # Deliberately broad, as before: this is a CLI boundary and every
        # failure to obtain the config is reported and ends the run.
        print("Error: unable to load extractor config: {}".format(error))
        exit(1)

    iextractor = ActiveLearningCore(
        relation, labeled_evidences, extractor_config, tradeoff=tuning_mode
    )
    return iextractor


def run_from_command_line():
    """Command-line entry point: obtain an extractor, predict and store results.

    Steps, in order:
      1. Parse options with docopt and configure logging.
      2. Resolve the tuning mode and the relation (either may exit with 1).
      3. Load the labels already available for the relation's candidates.
      4. Load a trained extractor (which also forces --no-questions) or
         construct and start a new one.
      5. Unless --no-questions is set, run the interactive questions loop.
      6. Predict over all candidates; exit with 1 if nothing was predicted.
      7. Store predictions in the database and/or a CSV file, and save the
         extractor, depending on the options given.
    """
    opts = docopt(__doc__, version=iepy.__version__)

    logging.basicConfig(format='%(message)s', level=logging.INFO)
    logging.getLogger("featureforge").setLevel(logging.WARN)

    tuning_mode = _get_tuning_mode(opts)
    relation = _get_relation(opts)

    manager = CandidateEvidenceManager
    labeled_evidences = load_labeled_evidences(
        relation, manager.candidates_for_relation(relation)
    )

    if not opts.get('--trained-extractor'):
        iextractor = _construct_extractor(
            opts, relation, labeled_evidences, tuning_mode
        )
        iextractor.start()
        was_ever_trained = False
    else:
        iextractor = _load_extractor(opts, relation, labeled_evidences)
        was_ever_trained = True
        # A pre-trained extractor is used as is: never ask questions.
        opts["--no-questions"] = True

    ask_questions = not opts.get("--no-questions", False)
    if ask_questions:
        questions_loop(iextractor, relation, was_ever_trained)

    # The candidates are requested again rather than reused: the earlier
    # generator was consumed while computing labeled_evidences.
    # Predictions are asked for EVERYTHING.
    predictions = iextractor.predict(manager.candidates_for_relation(relation))
    if not predictions:
        print("Nothing was predicted")
        exit(1)

    if opts.get("--db-store"):
        output.dump_predictions_to_database(relation, predictions)

    csv_path = opts.get("<output>")
    if csv_path:
        output.dump_runner_output_to_csv(predictions, csv_path)

    extractor_target = opts.get("--store-extractor")
    if extractor_target:
        iextractor.save(extractor_target)


def questions_loop(iextractor, relation, was_ever_trained):
    """Interactively collect human labels until no questions remain or STOP is chosen.

    Each round hands the extractor's current questions to a
    TerminalAdministration session, feeds every non-None label found for
    those questions back through ``iextractor.add_answer`` and then calls
    ``iextractor.process()``. If no round ever ran and `was_ever_trained`
    is false, ``process()`` is still called once before returning.
    """
    STOP = u'STOP'
    terminal = TerminalAdministration(
        relation,
        extra_options=[(STOP, u'Stop execution')]
    )

    while iextractor.questions:
        # Work on a snapshot of the questions for this round.
        pending = list(iextractor.questions)
        terminal.update_candidate_evidences_to_label(pending)
        outcome = terminal()

        answered = 0
        for evidence, label in load_labeled_evidences(relation, pending).items():
            if label is None:
                continue
            iextractor.add_answer(evidence, label)
            answered += 1
        print('Added %s new human labels to the extractor core' % answered)

        iextractor.process()
        was_ever_trained = True
        if outcome == STOP:
            break

    if not was_ever_trained:
        # Some processing has to happen before predictions are requested.
        iextractor.process()


# Run the command-line entry point only when this file is executed as a script.
if __name__ == u'__main__':
    run_from_command_line()