dedup.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. # -*- coding: utf-8 -*-
  2. """Helpers for deduplicating ranked retrieval results by document content."""
import math
import re
  4. _WHITESPACE_RE = re.compile(r"\s+")
  5. def normalized_content(text):
  6. """Normalize content for exact duplicate detection."""
  7. if text is None:
  8. return ""
  9. return _WHITESPACE_RE.sub(" ", str(text).strip())
  10. def content_dedup_key(doc):
  11. """Build a stable dedup key, preferring normalized page content."""
  12. content = normalized_content(getattr(doc, "page_content", ""))
  13. if content:
  14. return ("content", content)
  15. return ("object", id(doc))
  16. def deduplicate_ranked_results(results, k=None):
  17. """
  18. Deduplicate ``(doc, score)`` retrieval results by content.
  19. The highest scoring duplicate wins. Ties keep the earlier result. Final
  20. output is sorted by score descending, with original order as the tie-breaker.
  21. """
  22. if k is not None and k <= 0:
  23. return []
  24. best_by_key = {}
  25. for order, (doc, score) in enumerate(results or []):
  26. key = content_dedup_key(doc)
  27. score = float(score)
  28. current = best_by_key.get(key)
  29. if current is None or score > current[2]:
  30. best_by_key[key] = (order, doc, score)
  31. ranked = sorted(best_by_key.values(), key=lambda item: (-item[2], item[0]))
  32. deduped = [(doc, score) for _, doc, score in ranked]
  33. if k is None:
  34. return deduped
  35. return deduped[:k]