simstring_pure_overlap.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435
  1. from simstring.measure.base import BaseMeasure
  2. from sys import maxsize
  3. import math
  4. class OverlapMeasure(BaseMeasure):
  5. def min_feature_size(self, query_size, alpha):
  6. return 1
  7. def max_feature_size(self, query_size, alpha):
  8. return maxsize
  9. def minimum_common_feature_count(self, query_size, y_size, alpha):
  10. return int(math.ceil(alpha * min(query_size, y_size)))
  11. def similarity(self, X, Y):
  12. return min(len(set(X)), len(set(Y)))
  13. def patch_search(self, query_string, alpha):
  14. features = self.feature_extractor.features(query_string)
  15. min_feature_size = self.measure.min_feature_size(len(features), alpha)
  16. max_feature_size = self.measure.max_feature_size(len(features), alpha)
  17. # -- PATCH START
  18. max_feature_size = min(max_feature_size, self.db.max_feature_size())
  19. # -- PATCH END
  20. results = []
  21. for candidate_feature_size in range(min_feature_size, max_feature_size + 1):
  22. tau = self._Searcher__min_overlap(len(features), candidate_feature_size, alpha)
  23. results.extend(self._Searcher__overlap_join(features, tau, candidate_feature_size))
  24. return results
# Imported here, directly beside the patch that needs it.
from simstring.searcher import Searcher

# Monkey-patch: rebind the class-level ``search`` attribute of Searcher to
# patch_search. This runs as an import-time side effect of this module, so
# the replacement takes effect only once this module has been imported.
Searcher.search = patch_search