from copy import copy
import inspect
import logging
import pickle
import random
import os.path

import numpy
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve

from iepy import defaults
from iepy.extraction.relation_extraction_classifier import RelationExtractionClassifier

logger = logging.getLogger(__name__)

HIPREC = (10, 1)  # Precision is 10x more important than recall
HIREC = (1, 2)  # Recall is 2x more important than precision
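
# Illustrative note (sketch): either tuple can be passed as the ``tradeoff``
# argument of ActiveLearningCore so that process() estimates a decision
# threshold biased toward that balance, e.g.:
#
#     core = ActiveLearningCore(relation, labeled_evidences, tradeoff=HIPREC)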


class ActiveLearningCore:
    """
    IEPY's main class. Implements an active learning information extraction
    pipeline.

    From the user's point of view this class is meant to be used like this::

        extractor = ActiveLearningCore(relation, lbl_evidences)
        extractor.start()  # blocking
        while UserIsNotTired and extractor.questions:
            question = extractor.questions[0]
            answer = ask_user(question)
            extractor.add_answer(question, answer)
            extractor.process()
        predictions = extractor.predict(candidates)  # profit
    """

    #
    # IEPY User API
    #

    def __init__(self, relation, labeled_evidences, extractor_config=None,
                 tradeoff=None, extractor=None, classifier=None):
        if extractor is None:
            extractor = RelationExtractionClassifier
        self.extractor = extractor
        self.relation = relation
        self.classifier = classifier
        self._setup_labeled_evidences(labeled_evidences)
        self._questions = list(self.candidate_evidence)
        if extractor_config is None:
            extractor_config = defaults.extractor_config
        self.extractor_config = extractor_config
        self.tradeoff = tradeoff
        self.aimed_tradeoff = None
        self.threshold = None

    _DUMPED_ATTRS = ['relation', 'extractor', 'extractor_config', 'classifier',
                     'tradeoff', 'aimed_tradeoff', 'threshold']

    def save(self, file_path):
        if os.path.exists(file_path):
            raise ValueError("Output file path already exists")
        to_dump = [getattr(self, attr) for attr in self._DUMPED_ATTRS]
        with open(file_path, 'wb') as filehandler:
            pickle.dump(to_dump, filehandler)

    @classmethod
    def load(cls, file_path, **kwargs):
        if not os.path.exists(file_path):
            raise ValueError("File does not exist")
        with open(file_path, 'rb') as filehandler:
            data = pickle.load(filehandler)
        loading_kwargs = copy(kwargs)
        if 'labeled_evidences' not in kwargs:
            loading_kwargs['labeled_evidences'] = {}
        after = {}
        specs = inspect.getfullargspec(cls.__init__)
        for attr, value in zip(cls._DUMPED_ATTRS, data):
            if attr in specs.args:
                loading_kwargs[attr] = value
            else:
                after[attr] = value
        self = cls(**loading_kwargs)
        for after_attr, value in after.items():
            logger.debug("Setting %s from the dumped data", after_attr)
            setattr(self, after_attr, value)
        return self
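
    # A minimal save/load round trip might look like this (illustrative sketch;
    # the path and variable names are made up):
    #
    #     core.save("/tmp/active_learning_core.pkl")
    #     restored = ActiveLearningCore.load("/tmp/active_learning_core.pkl",
    #                                        labeled_evidences=my_evidences)
    #
    # load() routes dumped attributes that match constructor arguments through
    # __init__ and sets the remaining ones on the instance afterwards.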

    def start(self):
        """
        Organizes the internal information, and prepares the first "questions"
        that need to be answered.
        """
        # API compliance. Nothing is done in the current implementation.
        pass

    @property
    def questions(self):
        """Returns a list of candidate evidences that would be good to have
        labels for.

        Order is important: labels for evidences listed first are more valuable.
        """
        return self._questions

    def add_answer(self, evidence, answer):
        """
        Not blocking.
        Informs the Core of the evidence label (True or False) decided
        from the outside.
        """
        assert answer in (True, False)
        self.labeled_evidence[evidence] = answer
        for list_ in (self._questions, self.candidate_evidence):  # TODO: Check performance. Should use set?
            list_.remove(evidence)
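
    # E.g. (sketch): core.add_answer(core.questions[0], True) records a positive
    # label and drops that evidence from both the pending questions and the
    # candidate pool.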

    def process(self):
        """
        Blocking.

        With all the labeled evidences, new questions are generated, optimizing
        the future gain of having those evidences labeled.

        After calling this method the values returned by `questions`
        and `predict` will change.
        """
        yesno = set(self.labeled_evidence.values())
        if len(yesno) > 2:
            msg = "Evidence is not binary! Can't proceed."
            logger.error(msg)
            raise ValueError(msg)
        if len(yesno) < 2:
            logger.debug("Not enough labels to train.")
            return
        if self.tradeoff:
            self.estimate_threshold()
        self.train_relation_classifier()
        self.rank_candidate_evidence()
        self.choose_questions()

    def predict(self, candidates):
        """
        Using the internal trained classifier, all candidate evidence is
        automatically labeled.

        Returns a dict {evidence: True/False}, where the boolean label
        indicates whether the relation is present in that evidence or not.
        """
        if not self.classifier:
            logger.info("There is no trained classifier. Can't predict")
            return {}
        # For every already labeled candidate, instead of asking the classifier
        # we'll use the actual label.
        knowns = copy(self.labeled_evidence)
        to_predict = [c for c in candidates if c not in knowns]
        if self.threshold is None:
            labels = self.classifier.predict(to_predict)
        else:
            scores = self.classifier.decision_function(to_predict)
            labels = scores >= self.threshold
        prediction = dict(zip(to_predict, map(bool, labels)))
        prediction.update(knowns)
        return prediction
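
    # For instance (hypothetical names): predict([ev1, ev2]) returns something
    # like {ev1: True, ev2: False}; evidences that were already labeled are
    # passed through with their known labels instead of the classifier output.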

    def estimate_threshold(self):
        scores, y_true = self.get_kfold_data()
        if scores is None:
            return
        prec, rec, thres = precision_recall_curve(y_true, scores)
        prec[-1] = 0.0  # To avoid choosing the last phony value
        c_prec, c_rec = self.tradeoff
        # The expression below is a linear function of precision and recall
        # (in numpy notation); we pick the index that maximizes it.
        i = (prec * c_prec + rec * c_rec).argmax()  # Index of the maximum
        assert i < len(thres)  # Because prec[-1] is 0.0
        self.aimed_tradeoff = (prec[i], rec[i])
        self.threshold = thres[i]
        s = "Using {} samples, threshold aiming at precision={:.4f} and recall={:.4f}"
        logger.debug(s.format(len(scores), prec[i], rec[i]))
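
    # Worked example with hypothetical numbers: for tradeoff=HIPREC=(10, 1),
    # an operating point (prec=0.9, rec=0.2) scores 10*0.9 + 0.2 = 9.2 while
    # (prec=0.6, rec=0.8) scores 10*0.6 + 0.8 = 6.8, so the threshold of the
    # high-precision point is the one that gets kept.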

    # Instance attributes:
    # questions: A list of evidence
    # ranked_candidate_evidence: A dict candidate_evidence -> float
    # aimed_tradeoff: A (prec, rec) tuple with the precision/recall tradeoff
    #                 that the threshold aims to achieve.

    #
    # Private methods
    #

    def _setup_labeled_evidences(self, labeled_evidences):
        self.candidate_evidence = []
        self.labeled_evidence = {}
        for e, lbl in labeled_evidences.items():
            e.relation = self.relation
            if lbl is None:
                self.candidate_evidence.append(e)
            else:
                self.labeled_evidence[e] = bool(lbl)
        if not self.candidate_evidence:
            raise ValueError("Cannot start core without candidate evidence")
        logger.info("Loaded {} candidate evidence and {} labeled evidence".format(
            len(self.candidate_evidence), len(self.labeled_evidence)))

    def train_relation_classifier(self):
        X = []
        y = []
        for evidence, score in self.labeled_evidence.items():
            X.append(evidence)
            y.append(int(score))
            assert y[-1] in (True, False)
        self.classifier = self.extractor(**self.extractor_config)
        self.classifier.fit(X, y)

    def rank_candidate_evidence(self):
        if not self.candidate_evidence:
            self.ranked_candidate_evidence = {}
            logger.info("No evidence left to rank.")
            return
        N = min(10 * len(self.labeled_evidence), len(self.candidate_evidence))
        logger.info("Ranking a sample of {} candidate evidence".format(N))
        sample = random.sample(self.candidate_evidence, N)
        ranks = self.classifier.decision_function(sample)
        self.ranked_candidate_evidence = dict(zip(sample, ranks))
        ranks = [abs(x) for x in ranks]
        logger.info("Ranking completed, lowest absolute rank={}, "
                    "highest absolute rank={}".format(min(ranks), max(ranks)))

    def choose_questions(self):
        # Criteria: Answer first candidates with decision function near 0
        # because they are the most uncertain for the classifier.
        self._questions = sorted(self.ranked_candidate_evidence,
                                 key=lambda x: abs(self.ranked_candidate_evidence[x]))
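
    # Illustrative example (made-up scores): if the ranked candidates have
    # decision-function values {a: -3.2, b: 0.1, c: -0.4, d: 2.7}, the questions
    # come out ordered as [b, c, d, a], i.e. closest to zero first.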

    def get_kfold_data(self):
        """
        Perform k-fold cross validation and return (scores, y_true) where
        scores is a numpy array with decision function scores and y_true
        is a numpy array with the true label for that evidence.
        """
        allX = []
        ally = []
        for evidence, score in self.labeled_evidence.items():
            allX.append(evidence)
            ally.append(int(score))
            assert ally[-1] in (True, False)
        allX = numpy.array(allX)
        ally = numpy.array(ally)
        if numpy.bincount(ally).min() < 5:
            return None, None  # Too little data to do 5-fold cross validation
        logger.debug("Performing 5-fold cross validation")
        scores = []
        y_true = []
        for train_index, test_index in StratifiedKFold(n_splits=5).split(allX, ally):
            X = allX[train_index]
            y = ally[train_index]
            c = self.extractor(**self.extractor_config)
            c.fit(X, y)
            y_true.append(ally[test_index])
            scores.append(c.decision_function(allX[test_index]))
        return numpy.hstack(scores), numpy.hstack(y_true)