# -*- coding: utf-8 -*-
"""Helpers for deduplicating ranked retrieval results by document content."""

import math
import re

_WHITESPACE_RE = re.compile(r"\s+")


def normalized_content(text):
    """Normalize content for exact duplicate detection.

    Args:
        text: Any value; it is converted with ``str()``. ``None`` is treated
            as empty content.

    Returns:
        The text with leading/trailing whitespace stripped and every internal
        run of whitespace collapsed to a single space. ``""`` for ``None``.
    """
    if text is None:
        return ""
    return _WHITESPACE_RE.sub(" ", str(text).strip())


def content_dedup_key(doc):
    """Build a dedup key, preferring normalized page content.

    Args:
        doc: Any object. Its ``page_content`` attribute is used when present.

    Returns:
        ``("content", normalized_text)`` when the normalized content is
        non-empty, otherwise ``("object", id(doc))`` so that documents without
        usable content are only considered duplicates of themselves.
    """
    content = normalized_content(getattr(doc, "page_content", ""))
    if content:
        return ("content", content)
    # NOTE: id() is only unique while the object is alive, so this key is
    # meaningful only for the duration of a single dedup pass.
    return ("object", id(doc))


def _rank_key(score):
    """Return a sort key for *score* where a smaller key means a better rank.

    NaN compares false against everything, which would make both the
    "is this duplicate better?" check and the final sort unreliable, so NaN
    is ranked strictly below every real score (including ``-inf``).
    """
    if math.isnan(score):
        return (1, 0.0)
    return (0, -score)


def deduplicate_ranked_results(results, k=None):
    """Deduplicate ``(doc, score)`` retrieval results by content.

    The highest scoring duplicate wins. Ties keep the earlier result.
    Final output is sorted by score descending, with original order
    as the tie-breaker. NaN scores rank below every real score, so a NaN
    entry never displaces or outranks a validly scored one.

    Args:
        results: Iterable of ``(doc, score)`` pairs, or ``None``. Scores are
            converted with ``float()``.
        k: Optional maximum number of results to return. ``None`` returns all
            deduplicated results; a value ``<= 0`` returns an empty list
            without reading ``results``.

    Returns:
        A list of ``(doc, score)`` tuples with ``score`` as a ``float``.

    Raises:
        TypeError, ValueError: If an item is not a two-element pair or its
            score cannot be converted by ``float()``.
    """
    if k is not None and k <= 0:
        return []

    # key -> (rank_key, original_order, doc, score)
    best_by_key = {}
    for order, (doc, score) in enumerate(results or []):
        key = content_dedup_key(doc)
        score = float(score)
        rank = _rank_key(score)
        current = best_by_key.get(key)
        # Strict comparison: an equally ranked later duplicate never replaces
        # the earlier one.
        if current is None or rank < current[0]:
            best_by_key[key] = (rank, order, doc, score)

    # Sort on (rank_key, order) only; docs themselves are never compared.
    ranked = sorted(best_by_key.values(), key=lambda item: (item[0], item[1]))
    deduped = [(doc, score) for _, _, doc, score in ranked]

    if k is None:
        return deduped
    return deduped[:k]