| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546 |
- # -*- coding: utf-8 -*-
- """Helpers for deduplicating ranked retrieval results by document content."""
import math
import re
# Matches any run of one or more whitespace characters.
_WHITESPACE_RE = re.compile(r"\s+")


def normalized_content(text):
    """Return *text* in a canonical form suitable for exact-match comparison.

    The value is converted with ``str()``, stripped of leading and trailing
    whitespace, and every internal run of whitespace is collapsed to a single
    space. Letter case is left untouched.

    Args:
        text: The content to normalize. ``None`` is treated as empty content;
            any other value is passed through ``str()`` first.

    Returns:
        The normalized string, or ``""`` when *text* is ``None``.
    """
    if text is not None:
        trimmed = str(text).strip()
        return _WHITESPACE_RE.sub(" ", trimmed)
    return ""
def content_dedup_key(doc):
    """Return the hashable key used to decide whether two docs are duplicates.

    The key is derived from the normalized ``page_content`` attribute of
    *doc* when that yields a non-empty string. Otherwise the key falls back to
    the object's identity, so such a doc only ever matches itself.

    Args:
        doc: Any object. A missing ``page_content`` attribute is treated the
            same as empty content.

    Returns:
        ``("content", <normalized text>)`` when the doc has non-empty
        normalized content, else ``("object", id(doc))``.
    """
    raw_text = getattr(doc, "page_content", "")
    normalized = normalized_content(raw_text)
    if not normalized:
        # No usable text: fall back to object identity. The id() value is only
        # meaningful while the object is alive in this process.
        return ("object", id(doc))
    return ("content", normalized)
def deduplicate_ranked_results(results, k=None):
    """Deduplicate ``(doc, score)`` retrieval results by content.

    Results whose ``content_dedup_key`` is equal are collapsed into a single
    entry. The highest scoring duplicate wins; ties keep the earlier result.
    The final output is sorted by score descending, with original order as
    the tie-breaker.

    NaN scores cannot be ordered, so they are treated as lower than every
    real score: a NaN-scored entry never displaces an existing duplicate, is
    displaced by any duplicate with a real score, and is placed after all
    real-scored entries in the output (in original order among themselves).

    Args:
        results: Iterable of ``(doc, score)`` pairs. A falsy value such as
            ``None`` or an empty list is treated as no results.
        k: Optional maximum number of entries to return. ``None`` returns
            everything; zero or a negative value returns an empty list.

    Returns:
        A list of ``(doc, score)`` tuples where ``score`` has been converted
        with ``float()``.

    Raises:
        TypeError, ValueError: If a score cannot be converted by ``float()``
            or an item of *results* cannot be unpacked into two values.
    """
    if k is not None and k <= 0:
        return []

    # Maps dedup key -> (original position, doc, score) of the current winner.
    best_by_key = {}
    for order, (doc, score) in enumerate(results or []):
        key = content_dedup_key(doc)
        score = float(score)
        current = best_by_key.get(key)
        if current is None:
            best_by_key[key] = (order, doc, score)
            continue

        best_score = current[2]
        # ``>`` is always False when either operand is NaN, so a NaN winner
        # would otherwise be stuck forever; let any real score replace it.
        replaces_nan = math.isnan(best_score) and not math.isnan(score)
        if score > best_score or replaces_nan:
            best_by_key[key] = (order, doc, score)

    def sort_key(item):
        order, _, score = item
        # Keep NaN out of the comparison key: comparing against NaN is
        # inconsistent and would scramble the order of the valid scores.
        if math.isnan(score):
            return (1, 0.0, order)
        return (0, -score, order)

    ranked = sorted(best_by_key.values(), key=sort_key)
    deduped = [(doc, score) for _, doc, score in ranked]
    if k is None:
        return deduped
    return deduped[:k]
|