# -*- coding: utf-8 -*-
"""Exercise each RAG retriever directly (bypassing the BidiRag wrapper).

Every ``test_*`` function indexes the same small in-memory corpus, runs one
query, and prints the hit count, the retrieval latency and a naive precision
figure (share of hits whose text contains ``RELEVANCE_MARKER``).
"""
import os
import sys
import time

# Put the parent of this script's directory on sys.path so that ``bdirag``
# can be imported when the script is executed directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from bdirag.document_processor import Document
from bdirag.rag_methods.bm25_rag import BM25RAG
from bdirag.rag_methods.tfidf_rag import TFIDFRAG
from bdirag.rag_methods.keyword_rag import KeywordRAG
from bdirag.rag_methods.bm25_html_tree_rag import BM25HTMLTreeRAG

# Shared presentation / scoring constants.
SEPARATOR = "=" * 80
RELEVANCE_MARKER = "预算"  # a hit counts as relevant when its text contains this
PREVIEW_COUNT = 2          # number of hits echoed in the preview section
PREVIEW_CHARS = 80         # characters of each hit shown in the preview

# NOTE(review): the line breaks inside these fixtures were reconstructed from
# a whitespace-collapsed copy of the file - confirm against the original.
TEST_DOCS = [
    Document(page_content="""XX市第一人民医院医疗设备招标公告
项目名称:XX市第一人民医院彩色多普勒超声诊断仪采购项目
项目编号:XX-ZB-2024-001
预算金额:500万元
采购内容:彩色多普勒超声诊断仪 1台
投标人资格要求:
1. 具有独立承担民事责任的能力
2. 具有有效的医疗器械经营许可证
3. 近三年内无不良经营记录
4. 投标保证金:人民币5万元整
技术需求:
1. 彩色多普勒超声诊断仪技术参数
   - 探头配置:腹部凸阵探头、高频线阵探头、心脏相控阵探头
   - 显示屏:≥19英寸高清液晶显示器
   - 质保期:整机质保三年
2. 交货时间:合同签订后60天内交货
3. 交货地点:XX市第一人民医院设备科
评标方法:采用综合评分法
- 技术部分:60分
- 商务部分:30分
- 价格部分:10分
付款方式:合同签订后支付30%,交货验收合格后支付65%,质保期满后支付5%"""),
    Document(page_content="""XX市智慧交通系统建设项目招标公告
项目名称:XX市智慧交通系统建设项目
项目编号:XX-ZB-2024-002
招标人:XX市交通运输局
预算金额:5000万元
项目内容:
1. 交通信号控制系统
2. 视频监控系统
3. 交通流量监测系统
4. 数据分析平台
资质要求:
1. 电子与智能化工程专业承包二级以上资质
2. 近三年至少完成2个类似项目业绩
评标方法:综合评分法
- 技术部分:60分
- 商务部分:40分
交货时间:合同签订后180天内
质保期:3年"""),
]


def _print_banner(title):
    """Print *title* framed by separator rules, preceded by a blank line."""
    print("\n" + SEPARATOR)
    print(title)
    print(SEPARATOR)


def _print_metrics(results, elapsed):
    """Print hit count, retrieval time and naive precision.

    Args:
        results: Sequence of ``(document, score)`` pairs.
        elapsed: Retrieval duration in seconds.
    """
    relevant = sum(
        1 for doc, _ in results if RELEVANCE_MARKER in doc.page_content
    )
    # Guard against division by zero when nothing was retrieved.
    precision = relevant / len(results) if results else 0
    print("召回数量: {}".format(len(results)))
    print("检索时间: {:.4f}s".format(elapsed))
    print("精确度: {:.1%}".format(precision))


def _preview(doc):
    """Return the first ``PREVIEW_CHARS`` characters of *doc* on one line."""
    return doc.page_content[:PREVIEW_CHARS].replace("\n", " ")


def _run_retrieval_test(title, rag_factory, query, k=3):
    """Index ``TEST_DOCS`` with a fresh retriever, query it, and report.

    Args:
        title: Heading printed above the report.
        rag_factory: Zero-argument callable returning an object that offers
            ``index_documents(docs)`` and ``retrieve(query, k=...)``.
        query: Query string handed to ``retrieve``.
        k: Maximum number of hits requested.
    """
    _print_banner(title)
    # Instantiate after the banner so a constructor failure is still
    # attributed to the right section in the output.
    rag = rag_factory()
    rag.index_documents(TEST_DOCS)

    # perf_counter is monotonic and high-resolution, unlike time.time().
    started = time.perf_counter()
    results = rag.retrieve(query, k=k)
    elapsed = time.perf_counter() - started

    _print_metrics(results, elapsed)
    if results:
        print("\n结果预览:")
        for rank, (doc, score) in enumerate(results[:PREVIEW_COUNT], 1):
            print(" [{}] Score={:.4f} | {}".format(rank, score, _preview(doc)))


def test_bm25():
    """Run the BM25 retriever against the sample corpus."""
    _run_retrieval_test("测试 BM25", BM25RAG, "预算金额")


def test_tfidf():
    """Run the TF-IDF retriever against the sample corpus."""
    _run_retrieval_test("测试 TF-IDF", TFIDFRAG, "预算金额")


def test_keyword():
    """Run the keyword retriever against the sample corpus."""
    _run_retrieval_test("测试 Keyword", KeywordRAG, "预算")


def test_bm25_html_tree():
    """Run the BM25 HTML-tree retriever against the sample corpus.

    This retriever is fed one string (all documents joined by newlines) via
    ``build_index`` and queried via ``query`` instead of
    ``index_documents`` / ``retrieve``; its hits also expose a ``path``
    metadata entry that is included in the preview.
    """
    _print_banner("测试 BM25 HTML Tree")
    html_content = "\n".join(doc.page_content for doc in TEST_DOCS)
    rag = BM25HTMLTreeRAG()
    rag.build_index(html_content)

    started = time.perf_counter()
    results = rag.query("预算金额", k=3)
    elapsed = time.perf_counter() - started

    _print_metrics(results, elapsed)
    if results:
        print("\n结果预览:")
        for rank, (doc, score) in enumerate(results[:PREVIEW_COUNT], 1):
            path = doc.metadata.get("path", "")
            print(" [{}] Score={:.4f} | Path: {}".format(rank, score, path[:60]))
            print(" Content: {}...".format(_preview(doc)))


# (label used in failure messages, test callable), in execution order.
_TEST_CASES = (
    ("BM25", test_bm25),
    ("TF-IDF", test_tfidf),
    ("Keyword", test_keyword),
    ("BM25 HTML Tree", test_bm25_html_tree),
)


def main():
    """Run every retriever test, reporting failures without aborting."""
    print(SEPARATOR)
    print("RAG 方法直接测试")
    print(SEPARATOR)

    for label, test_func in _TEST_CASES:
        try:
            test_func()
        except Exception as exc:
            # Deliberately best-effort: one broken retriever must not keep
            # the remaining ones from being exercised.
            print("{} 失败: {}".format(label, exc))

    print("\n\n" + SEPARATOR)
    print("测试完成!")
    print(SEPARATOR)


if __name__ == "__main__":
    main()