| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- # -*- coding: utf-8 -*-
- """Focused tests for content-level retrieval deduplication."""
- import os
- import sys
# Put the parent of this file's directory on sys.path (once) so the
# ``bdirag`` and ``examples`` imports that follow can be resolved when the
# package is not installed.
ROOT_DIR = os.path.dirname(
    os.path.dirname(os.path.abspath(__file__))
)
if ROOT_DIR not in sys.path:
    # Prepend so this checkout takes precedence over other sys.path entries.
    sys.path.insert(0, ROOT_DIR)
- from bdirag.document_processor import Document
- from bdirag.rag_methods.bm25_html_tree_rag import BM25HTMLTreeRAG
- from bdirag.rag_methods.bm25_rag import BM25RAG
- from bdirag.rag_methods.dedup import deduplicate_ranked_results
- from bdirag.rag_methods.ensemble_rag import EnsembleRAG
- from examples.rag_test_utils import FakeEmbedding, SimpleVectorStore, install_rank_bm25_fallback
def test_deduplicate_ranked_results_keeps_highest_score():
    """Duplicates differing only in surrounding whitespace keep the best score.

    Two documents share the text "Duplicate content" (one padded with
    spaces) and a third is distinct; the result must hold two entries and
    start with the 0.9-scored duplicate.
    """
    padded_low = Document(
        page_content=" Duplicate content ", metadata={"source": "low"}
    )
    clean_high = Document(
        page_content="Duplicate content", metadata={"source": "high"}
    )
    unrelated = Document(
        page_content="Other content", metadata={"source": "other"}
    )
    ranked = [(padded_low, 0.1), (unrelated, 0.2), (clean_high, 0.9)]

    deduped = deduplicate_ranked_results(ranked, k=10)

    assert len(deduped) == 2
    best = deduped[0]
    assert best[0].metadata["source"] == "high"
    assert best[1] == 0.9
def test_deduplicate_ranked_results_keeps_first_on_score_tie():
    """When duplicate contents tie on score, the earlier entry is kept."""
    earlier = Document(
        page_content="Same content", metadata={"source": "first"}
    )
    later = Document(
        page_content=" Same content ", metadata={"source": "second"}
    )
    tied = [(earlier, 0.5), (later, 0.5)]

    deduped = deduplicate_ranked_results(tied, k=10)

    assert len(deduped) == 1
    survivor = deduped[0][0]
    assert survivor.metadata["source"] == "first"
def test_bm25_retrieve_deduplicates_equal_content_documents():
    """BM25RAG.retrieve must not return the same page_content twice.

    Sources "a" and "b" carry identical text; after indexing and querying
    "alpha beta" with k=3, that text may appear only once and every
    returned content must be unique.
    """
    install_rank_bm25_fallback()
    rag = BM25RAG()

    # (text, source) pairs; the first two rows are intentional duplicates.
    corpus = [
        ("alpha beta project budget", "a"),
        ("alpha beta project budget", "b"),
        ("alpha delivery schedule", "c"),
        ("gamma warranty terms", "d"),
        ("delta payment terms", "e"),
    ]
    docs = [
        Document(page_content=text, metadata={"source": source})
        for text, source in corpus
    ]
    rag.index_documents(docs)

    hits = rag.retrieve("alpha beta", k=3)
    contents = [hit_doc.page_content for hit_doc, _ in hits]

    assert contents.count("alpha beta project budget") == 1
    assert len(set(contents)) == len(contents)
def test_ensemble_retrieve_merges_duplicate_content_from_distinct_objects():
    """EnsembleRAG.retrieve collapses equal-content Document objects.

    Sources "a" and "b" are separate Document instances with identical
    text. They are embedded with the fake embedding model and added
    straight to the vector store before querying.
    """
    corpus = [
        ("alpha beta project budget", "a"),
        ("alpha beta project budget", "b"),
        ("alpha delivery schedule", "c"),
    ]
    docs = [
        Document(page_content=text, metadata={"source": source})
        for text, source in corpus
    ]

    rag = EnsembleRAG(
        embedding_model=FakeEmbedding(),
        vector_store=SimpleVectorStore(),
    )
    texts = [doc.page_content for doc in docs]
    embeddings = rag.embedding_model.embed_documents(texts)
    rag.vector_store.add_documents(docs, embeddings)

    hits = rag.retrieve("alpha beta", k=3)
    contents = [hit_doc.page_content for hit_doc, _ in hits]

    assert contents.count("alpha beta project budget") == 1
    assert len(set(contents)) == len(contents)
def test_html_tree_query_deduplicates_formatted_documents():
    """BM25HTMLTreeRAG.query merges subtrees whose text differs only by padding.

    ``retrieve_subtrees`` and ``get_node_path`` are replaced on the instance
    with stubs, so only the logic inside ``query`` is exercised. The two
    stubbed subtrees carry the same text (one padded with spaces); a single
    result with title "B" and score 0.9 is expected.
    """
    rag = BM25HTMLTreeRAG()
    node_a = {"type": "p", "sentence_title_text": "A"}
    node_b = {"type": "p", "sentence_title_text": "B"}

    def fake_retrieve_subtrees(query, k):
        # Fixed (node, score, text) triples regardless of the query.
        return [
            (node_a, 0.7, "Repeated subtree text"),
            (node_b, 0.9, " Repeated subtree text "),
        ]

    def fake_get_node_path(node):
        return node["sentence_title_text"]

    # Instance-level assignment: the stubs are called without ``self``.
    rag.retrieve_subtrees = fake_retrieve_subtrees
    rag.get_node_path = fake_get_node_path

    results = rag.query("repeated", k=5)

    assert len(results) == 1
    only = results[0]
    assert only[0].metadata["title"] == "B"
    assert only[1] == 0.9
|