# -*- coding: utf-8 -*-
"""Focused tests for content-level retrieval deduplication."""

import os
import sys

# Put the parent of this file's directory on sys.path before the project
# imports below are attempted.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)

from bdirag.document_processor import Document
from bdirag.rag_methods.bm25_html_tree_rag import BM25HTMLTreeRAG
from bdirag.rag_methods.bm25_rag import BM25RAG
from bdirag.rag_methods.dedup import deduplicate_ranked_results
from bdirag.rag_methods.ensemble_rag import EnsembleRAG
from examples.rag_test_utils import FakeEmbedding, SimpleVectorStore, install_rank_bm25_fallback


def test_deduplicate_ranked_results_keeps_highest_score():
    """Expect the higher-scored copy to win when contents differ only by padding."""
    padded_low = Document(page_content=" Duplicate content ", metadata={"source": "low"})
    clean_high = Document(page_content="Duplicate content", metadata={"source": "high"})
    unrelated = Document(page_content="Other content", metadata={"source": "other"})

    ranked = [(padded_low, 0.1), (unrelated, 0.2), (clean_high, 0.9)]
    results = deduplicate_ranked_results(ranked, k=10)

    # Three inputs, two distinct contents.
    assert len(results) == 2
    best = results[0]
    assert best[0].metadata["source"] == "high"
    assert best[1] == 0.9


def test_deduplicate_ranked_results_keeps_first_on_score_tie():
    """Expect the earlier entry to be kept when duplicate contents share a score."""
    first = Document(page_content="Same content", metadata={"source": "first"})
    second = Document(page_content=" Same content ", metadata={"source": "second"})

    tied = [(first, 0.5), (second, 0.5)]
    results = deduplicate_ranked_results(tied, k=10)

    assert len(results) == 1
    survivor = results[0]
    assert survivor[0].metadata["source"] == "first"


def test_bm25_retrieve_deduplicates_equal_content_documents():
    """Expect BM25 retrieval to return each distinct content at most once."""
    install_rank_bm25_fallback()
    rag = BM25RAG()

    # (page_content, source) pairs; sources "a" and "b" carry identical text.
    corpus = [
        ("alpha beta project budget", "a"),
        ("alpha beta project budget", "b"),
        ("alpha delivery schedule", "c"),
        ("gamma warranty terms", "d"),
        ("delta payment terms", "e"),
    ]
    docs = [
        Document(page_content=text, metadata={"source": origin})
        for text, origin in corpus
    ]
    rag.index_documents(docs)

    results = rag.retrieve("alpha beta", k=3)
    contents = [hit.page_content for hit, _score in results]

    assert contents.count("alpha beta project budget") == 1
    assert len(contents) == len(set(contents))


def test_ensemble_retrieve_merges_duplicate_content_from_distinct_objects():
    """Expect ensemble retrieval to merge distinct objects with equal content."""
    # (page_content, source) pairs; sources "a" and "b" carry identical text.
    corpus = [
        ("alpha beta project budget", "a"),
        ("alpha beta project budget", "b"),
        ("alpha delivery schedule", "c"),
    ]
    docs = [
        Document(page_content=text, metadata={"source": origin})
        for text, origin in corpus
    ]

    rag = EnsembleRAG(embedding_model=FakeEmbedding(), vector_store=SimpleVectorStore())
    texts = [doc.page_content for doc in docs]
    embeddings = rag.embedding_model.embed_documents(texts)
    rag.vector_store.add_documents(docs, embeddings)

    results = rag.retrieve("alpha beta", k=3)
    contents = [hit.page_content for hit, _score in results]

    assert contents.count("alpha beta project budget") == 1
    assert len(contents) == len(set(contents))


def test_html_tree_query_deduplicates_formatted_documents():
    """Expect query() to keep one result for subtrees whose text differs only by padding."""
    rag = BM25HTMLTreeRAG()
    node_a = {"type": "p", "sentence_title_text": "A"}
    node_b = {"type": "p", "sentence_title_text": "B"}

    def stub_retrieve_subtrees(query, k):
        # Two subtrees with the same text modulo surrounding whitespace;
        # the second one carries the higher score.
        return [
            (node_a, 0.7, "Repeated subtree text"),
            (node_b, 0.9, " Repeated subtree text "),
        ]

    def stub_get_node_path(node):
        return node["sentence_title_text"]

    # Replace the instance's retrieval and path lookup with the stubs above.
    rag.retrieve_subtrees = stub_retrieve_subtrees
    rag.get_node_path = stub_get_node_path

    results = rag.query("repeated", k=5)

    assert len(results) == 1
    winner = results[0]
    assert winner[0].metadata["title"] == "B"
    assert winner[1] == 0.9