relation_extraction_classifier.py 3.4 KB

from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, make_union

from featureforge.vectorizer import Vectorizer

from iepy.extraction.features import parse_features


_valid_classifiers = {
    "sgd": SGDClassifier,
    "knn": KNeighborsClassifier,
    "svc": SVC,
    "randomforest": RandomForestClassifier,
    "adaboost": AdaBoostClassifier,
}

_configuration_options = """
classifier
classifier_args
sparse_features
dense_features
""".split()
class RelationExtractionClassifier:

    def __init__(self, **config):
        # Validate that every required option is present
        for option in _configuration_options:
            if option not in config:
                raise ValueError("Missing configuration "
                                 "option {!r}".format(option))

        # Feature extraction: sparse features are vectorized and collapsed into
        # a single dense score by ClassifierAsFeature; dense features are
        # vectorized directly, and both branches are concatenated.
        sparse_features = parse_features(config["sparse_features"])
        densifier = make_pipeline(Vectorizer(sparse_features, sparse=True),
                                  ClassifierAsFeature())
        dense_features = parse_features(config["dense_features"])
        vectorization = make_union(densifier,
                                   Vectorizer(dense_features, sparse=False))

        # Classifier
        try:
            classifier = _valid_classifiers[config["classifier"]]
        except KeyError:
            raise ValueError("Unknown classification algorithm "
                             "{!r}".format(config["classifier"]))
        classifier = classifier(**config["classifier_args"])

        self.pipeline = make_pipeline(vectorization, StandardScaler())
        self.classifier = classifier

    def fit(self, X, y):
        X = self.pipeline.fit_transform(X, y)
        self.classifier.fit(X, y)
        return self

    def _chew(self, evidences):
        # Run evidences through the (already fitted) feature pipeline
        return self.pipeline.transform(evidences)

    def _predict(self, X):
        return self.classifier.predict(X)

    def _rank(self, X):
        return self.classifier.decision_function(X).ravel()

    def predict(self, evidences):
        return self._predict(self._chew(evidences))

    def decision_function(self, evidences):
        return self._rank(self._chew(evidences))
class ClassifierAsFeature:
    """
    A transformation that essentially implements a form of dimensionality
    reduction.

    This class uses (by default) a fast SGDClassifier configured like a linear
    SVM to produce a single feature: the decision function of the classifier.
    It's useful for reducing a bag-of-words feature set into one feature that
    is denser in information.
    """

    def __init__(self, classifier=None):
        if classifier is None:
            classifier = SGDClassifier()
        self.classifier = classifier

    def fit(self, X, y):
        """
        `X` is expected to be an array-like or a sparse matrix.
        `y` is expected to be an array-like containing the classes to learn.
        """
        self.classifier.fit(X, y)
        return self

    def transform(self, X, y=None):
        """
        `X` is expected to be an array-like or a sparse matrix.
        Returns a dense matrix of shape (n_samples, 1).
        """
        return self.classifier.decision_function(X).reshape(-1, 1)
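
For context, here is a minimal usage sketch (not part of the file above). The config keys mirror _configuration_options; the feature names, the SVC arguments, and the evidence/label variables are placeholder assumptions standing in for whatever IEPY's extraction pipeline would normally supply.

# Usage sketch -- all names marked "assumed" are illustrative placeholders.
config = {
    "classifier": "svc",
    "classifier_args": {"kernel": "linear"},   # assumed example arguments
    "sparse_features": ["bag_of_words"],       # assumed IEPY feature name
    "dense_features": ["entity_order"],        # assumed IEPY feature name
}

clf = RelationExtractionClassifier(**config)
clf.fit(train_evidences, train_labels)          # evidences/labels produced elsewhere by IEPY
labels = clf.predict(test_evidences)            # hard yes/no predictions
scores = clf.decision_function(test_evidences)  # ranking scores; higher = more confident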