# debug_bm25_html.py (1.2 KB)
  1. # -*- coding: utf-8 -*-
  2. """调试 BM25 HTML Tree 检索"""
  3. import sys
  4. import os
  5. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  6. from bdirag.rag_methods.bm25_html_tree_rag import BM25HTMLTreeRAG
  7. # 创建测试 HTML
  8. html = """
  9. <html>
  10. <body>
  11. <h1>政府采购中标公告</h1>
  12. <table>
  13. <tr><td>采购人</td><td>XX市财政局</td></tr>
  14. <tr><td>中标人</td><td>XX科技有限公司</td></tr>
  15. <tr><td>中标金额</td><td>50万元</td></tr>
  16. </table>
  17. </body>
  18. </html>
  19. """
  20. print("创建 BM25HTMLTreeRAG 实例...")
  21. rag = BM25HTMLTreeRAG()
  22. print("\n构建索引...")
  23. rag.build_index(html)
  24. print(f"\n索引了 {len(rag.all_nodes)} 个节点")
  25. print("\n测试查询: '采购人'")
  26. results = rag.query("采购人", k=3)
  27. print(f"返回 {len(results)} 个结果")
  28. for i, (doc, score) in enumerate(results, 1):
  29. print(f"\n结果 {i} (score: {score:.3f}):")
  30. print(doc.page_content[:200])
  31. print("\n" + "=" * 80)
  32. print("\n测试查询: '中标人'")
  33. results = rag.query("中标人", k=3)
  34. print(f"返回 {len(results)} 个结果")
  35. for i, (doc, score) in enumerate(results, 1):
  36. print(f"\n结果 {i} (score: {score:.3f}):")
  37. print(doc.page_content[:200])