""" Complete benchmark script for comparing all RAG methods This script demonstrates various RAG methods and compares their performance """ import sys import os sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from openai import OpenAI from FlagEmbedding import FlagReranker from bdirag.document_processor import Document, DocumentProcessor from bdirag.embedding_models import SentenceTransformerEmbedding from bdirag.vector_stores import FAISSStore from bdirag.rag_methods import ( NaiveRAG, RerankRAG, HybridSearchRAG, MultiQueryRAG, HyDERAG, SelfRAG, CorrectiveRAG, FLARERAG, RAPTORRAG, BidFieldExtractionRAG, TableAwareRAG, EnsembleRAG, GraphRAG, StepBackRAG, ContextualCompressionRAG, BM25RAG, TFIDFRAG, KeywordRAG, ) from bdirag.benchmark import RAGBenchmark from examples.sample_data import SAMPLE_BIDDING_DOCS def setup_rag_methods(embedding_model, vector_store, llm_client, llm_model="gpt-4o"): """Initialize all RAG methods with shared components""" rerank_model = FlagReranker("BAAI/bge-reranker-large", use_fp16=True) methods = {} # Basic RAG Methods methods["NaiveRAG"] = NaiveRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, ) methods["RerankRAG"] = RerankRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, rerank_model=rerank_model, rerank_top_k=5, ) methods["HybridSearchRAG"] = HybridSearchRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, semantic_weight=0.5, ) # Advanced RAG Methods methods["MultiQueryRAG"] = MultiQueryRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, num_queries=3, ) methods["HyDERAG"] = HyDERAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, num_hypotheses=3, ) methods["SelfRAG"] = SelfRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, relevance_threshold=0.5, support_threshold=0.5, ) methods["CorrectiveRAG"] = CorrectiveRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, correctness_threshold=0.6, ) methods["FLARERAG"] = FLARERAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, max_iterations=3, ) methods["RAPTORRAG"] = RAPTORRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, max_clusters=10, summary_length=256, num_tree_levels=2, ) methods["StepBackRAG"] = StepBackRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, ) methods["ContextualCompressionRAG"] = ContextualCompressionRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, compression_llm=llm_client, llm_model=llm_model, ) methods["EnsembleRAG"] = EnsembleRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, methods=["naive", "hybrid", "multi_query"], ) # Bidding-specific RAG Methods methods["BidFieldExtractionRAG"] = BidFieldExtractionRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, ) methods["TableAwareRAG"] = TableAwareRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, ) methods["GraphRAG"] = GraphRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, ) # Keyword-based RAG Methods (BM25 / TF-IDF) methods["BM25RAG"] = BM25RAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, ) methods["TFIDFRAG"] = TFIDFRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, ) methods["KeywordRAG_BM25"] = KeywordRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, search_method="bm25", ) methods["KeywordRAG_TFIDF"] = KeywordRAG( embedding_model=embedding_model, vector_store=vector_store, llm_client=llm_client, llm_model=llm_model, search_method="tfidf", ) return methods def main(): print("=" * 60) print("BidiRAG - RAG Methods Benchmark for Bidding Domain") print("=" * 60) # Configuration LLM_API_KEY = os.getenv("OPENAI_API_KEY", "your-api-key-here") LLM_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1") LLM_MODEL = os.getenv("LLM_MODEL", "gpt-4o") EMBEDDING_MODEL_NAME = os.getenv("EMBEDDING_MODEL", "BAAI/bge-large-zh-v1.5") # Step 1: Initialize embedding model print("\n[1/5] Loading embedding model...") embedding_model = SentenceTransformerEmbedding( model_name=EMBEDDING_MODEL_NAME, device="cpu" ) print(" Embedding dimension: {0}.format(embedding_model.dimension)") # Step 2: Initialize vector store print("\n[2/5] Initializing vector store...") vector_store = FAISSStore(embedding_model=embedding_model) # Step 3: Process documents print("\n[3/5] Processing sample bidding documents...") documents = [ Document(page_content=doc["content"], metadata={"title": doc["title"], "source": doc["title"]}) for doc in SAMPLE_BIDDING_DOCS ] print(" Loaded {0} documents.format(len(documents))") # Step 4: Index documents print("\n[4/5] Indexing documents...") for method_name, method in setup_rag_methods(embedding_model, vector_store, None, LLM_MODEL).items(): if method_name == "RAPTORRAG": method.build_tree(documents) elif method_name == "GraphRAG": method.build_graph(documents) else: method.index_documents(documents) print(f" Indexing complete") # Step 5: Initialize LLM client print("\n[5/5] Initializing LLM client...") llm_client = OpenAI(api_key=LLM_API_KEY, base_url=LLM_BASE_URL) # Setup RAG methods print("\nSetting up RAG methods...") methods = setup_rag_methods(embedding_model, vector_store, llm_client, LLM_MODEL) # Define test queries test_queries = [ "XX City Smart Transportation Project budget and deadline?", "What are the qualification requirements for the hospital equipment procurement?", "What is the warranty period for the university network project?", "List all bid bond amounts in the announcements", "What evaluation methods are used across different projects?", "XX Road construction project payment terms?", "Environmental monitoring system equipment list?", "Which projects require Grade I qualification?", ] print("\nRunning benchmark with {0} queries across {1} methods....format(len(test_queries), len(methods))") print("=" * 60) # Run benchmark benchmark = RAGBenchmark() # Run methods one by one to avoid overwhelming the LLM API results_summary = {} for method_name, method in methods.items(): print("\n{0}.format('=' * 60)") print("Testing: {0}.format(method_name)") print("{0}.format('=' * 60)") method_results = [] for i, query in enumerate(test_queries): print("\n Query {0}/{1}: {2}.format(i + 1, len(test_queries), query)") try: result = method.query(query, k=5) method_results.append(result) print(" Answer: {0}....format(result.answer[:100])") print(" Total Latency: {0}s.format(result.latency_total:.3f)") print(" Retrieval: {0}s | Generation: {1}s.format(result.latency_retrieval:.3f, result.latency_generation:.3f)") except Exception as e: print(" ERROR: {0}.format(e)") results_summary[method_name] = method_results # Generate comparison report print("\n\n" + "=" * 60) print("BENCHMARK RESULTS") print("=" * 60) # Create metrics manually from results from bdirag.benchmark import BenchmarkMetrics, BenchmarkResult import numpy as np import datetime metrics = [] for method_name, results in results_summary.items(): if not results: continue total_latencies = [r.latency_total for r in results] retrieval_latencies = [r.latency_retrieval for r in results] generation_latencies = [r.latency_generation for r in results] num_docs = [len(r.retrieved_docs) for r in results] metric = BenchmarkMetrics( method_name=method_name, avg_latency_total=np.mean(total_latencies), avg_latency_retrieval=np.mean(retrieval_latencies), avg_latency_generation=np.mean(generation_latencies), avg_docs_retrieved=np.mean(num_docs), total_queries=len(results), latency_std=np.std(total_latencies), retrieval_std=np.std(retrieval_latencies), generation_std=np.std(generation_latencies), min_latency=np.min(total_latencies), max_latency=np.max(total_latencies), p50_latency=np.percentile(total_latencies, 50), p95_latency=np.percentile(total_latencies, 95), ) metrics.append(metric) # Print comparison table print("\nLatency Comparison (sorted by average total latency):") print("-" * 100) print("{0} {1} {2} {3} {4} {5} {6} {7}.format('Method':<30, 'Avg Total':>10, 'Avg Retri':>10, 'Avg Gener':>10, 'P50':>8, 'P95':>8, 'Min':>8, 'Max':>8)") print("-" * 100) for m in sorted(metrics, key=lambda x: x.avg_latency_total): print( "{0} {1} {2} .format(m.method_name:<30, m.avg_latency_total:>10.3f, m.avg_latency_retrieval:>10.3f)" "{0} {1} {2} .format(m.avg_latency_generation:>10.3f, m.p50_latency:>8.3f, m.p95_latency:>8.3f)" "{0} {1}.format(m.min_latency:>8.3f, m.max_latency:>8.3f)" ) print("-" * 100) print("\nFastest Method: {0}.format(min(metrics, key=lambda x: x.avg_latency_total).method_name)") print("Most Stable: {0}.format(min(metrics, key=lambda x: x.latency_std).method_name)") # Save results output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "output") os.makedirs(output_dir, exist_ok=True) benchmark_result = BenchmarkResult( metrics=metrics, detailed_results={}, timestamp=datetime.datetime.now().isoformat(), ) benchmark_result.save(os.path.join(output_dir, "benchmark_results.json")) benchmark.generate_report(benchmark_result, os.path.join(output_dir, "benchmark_report.md")) try: benchmark.plot_comparison( metrics, save_path=os.path.join(output_dir, "benchmark_comparison.png"), show=False, ) except Exception as e: print("\nNote: Could not generate plot: {0}.format(e)") print("\nResults saved to {0}.format(output_dir)") print("\nBenchmark complete!") if __name__ == "__main__": main()