| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282 |
- # -*- coding: utf-8 -*-
- """
- 从招投标公告中提取招标人、中标人等关键信息
- 这个示例展示了如何使用 BidiRag 召回 HTML 公告中的内容片段,
- 并从中提取关键信息。
- """
- import sys
- import os
- import re
- sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
- from bdirag.bidi_rag import BidiRag
def extract_key_info(html_content):
    """Extract key bidding fields from an HTML announcement.

    Every table row is inspected: the text of the first cell is treated as
    the field label and the text of the second cell as its value. When
    BeautifulSoup is not installed, or parsing raises, the function falls
    back to regular expressions over the raw markup.

    Args:
        html_content: HTML markup of a single announcement.

    Returns:
        A dict containing only the fields that were found, keyed by
        '招标人', '采购人', '采购意向单位', '中标人', '项目名称', '中标金额'
        or '预算金额'. Empty or missing input yields an empty dict.
    """
    info = {}
    if not html_content:
        return info

    try:
        # Imported inside the try block so that a missing bs4 installation
        # degrades to the regex fallback instead of raising ImportError.
        from bs4 import BeautifulSoup

        # Prefer lxml; fall back to the built-in parser when it is unavailable.
        try:
            soup = BeautifulSoup(html_content, 'lxml')
        except Exception:
            soup = BeautifulSoup(html_content, 'html.parser')

        for tr in soup.find_all('tr'):
            cells = tr.find_all(['td', 'th'])
            if len(cells) < 2:
                continue

            key = cells[0].get_text(strip=True)
            value = cells[1].get_text(strip=True)

            # Order matters: the first matching label wins for this row.
            if '招标人' in key:
                info['招标人'] = value
            elif '采购人' in key and '意向' not in key:
                info['采购人'] = value
            elif '采购意向单位' in key:
                info['采购意向单位'] = value
            elif '中标人' in key:
                info['中标人'] = value
            elif '项目名称' in key:
                # Also covers '采购项目名称', which contains '项目名称'.
                info['项目名称'] = value
            elif '中标金额' in key:
                info['中标金额'] = value
            elif '预算金额' in key:
                info['预算金额'] = value
    except Exception:
        # Best-effort fallback: match plain <td>label</td><td>value</td> pairs.
        patterns = {
            '招标人': r'<td>招标人</td>\s*<td>([^<]+)</td>',
            '采购人': r'<td>采购人</td>\s*<td>([^<]+)</td>',
            '采购意向单位': r'<td>采购意向单位</td>\s*<td>([^<]+)</td>',
            '中标人': r'<td>中标人</td>\s*<td>([^<]+)</td>',
            # Accept '采购项目名称' too, matching the parser-based path.
            '项目名称': r'<td>(?:采购)?项目名称</td>\s*<td>([^<]+)</td>',
            '中标金额': r'<td>中标金额</td>\s*<td>([^<]+)</td>',
            '预算金额': r'<td>预算金额</td>\s*<td>([^<]+)</td>',
        }

        for field, pattern in patterns.items():
            match = re.search(pattern, html_content)
            if match:
                info[field] = match.group(1).strip()

    return info
def demo_extract_bid_info():
    """Run the end-to-end demo: index sample announcements and print extracted fields.

    Builds a BidiRag instance with the 'bm25_html_tree' method, adds four
    sample HTML announcements, runs several keyword searches and prints the
    fields that extract_key_info finds in each hit, followed by a summary.
    """
    rule = "=" * 80

    def banner(title, lead=""):
        # Print a title framed by two horizontal rules.
        print(lead + rule)
        print(title)
        print(rule)

    def pick(fields, *names):
        # Mirror `a or b or c`: first truthy value, otherwise the last lookup.
        value = None
        for name in names:
            value = fields.get(name)
            if value:
                break
        return value

    def show_fields(fields):
        # Dump every extracted field, then a blank separator line.
        for label, text in fields.items():
            print(f" {label}: {text}")
        print()

    banner("招投标公告信息提取示例")

    # Step 1: create the retriever.
    print("\n[步骤 1] 初始化 BidiRag (使用 bm25_html_tree 方法)...")
    engine = BidiRag(rag_method='bm25_html_tree')

    # Step 2: load the sample announcements.
    print("\n[步骤 2] 添加招投标公告...")

    intent_notice = """<html>
<body>
<h1>政府采购意向公告</h1>
<table>
<tr><td>采购意向单位</td><td>大连长兴岛经济技术开发区交流岛街道办事处</td></tr>
<tr><td>采购项目名称</td><td>交流岛滨海路夜间出行照明提升工程</td></tr>
<tr><td>预算金额</td><td>147.060000万元</td></tr>
<tr><td>采购品目</td><td>路灯照明工程</td></tr>
<tr><td>采购需求概况</td><td>在滨海路安装太阳能路灯200盏(单排)</td></tr>
<tr><td>预计采购时间</td><td>2026-05</td></tr>
</table>
</body>
</html>"""

    award_notice = """<html>
<body>
<h1>中标公告</h1>
<table>
<tr><td>采购人</td><td>XX市财政局</td></tr>
<tr><td>项目名称</td><td>办公设备采购项目</td></tr>
<tr><td>中标人</td><td>XX办公设备有限公司</td></tr>
<tr><td>中标金额</td><td>50万元</td></tr>
<tr><td>采购内容</td><td>电脑、打印机、复印机等办公设备</td></tr>
</table>
</body>
</html>"""

    tender_notice = """<html>
<body>
<h1>招标公告</h1>
<table>
<tr><td>招标人</td><td>XX市交通运输局</td></tr>
<tr><td>项目名称</td><td>智慧交通系统建设项目</td></tr>
<tr><td>项目预算</td><td>5000万元</td></tr>
<tr><td>招标内容</td><td>交通信号控制系统、视频监控系统</td></tr>
<tr><td>投标截止时间</td><td>2024-06-15</td></tr>
</table>
</body>
</html>"""

    result_notice = """<html>
<body>
<h1>中标结果公告</h1>
<table>
<tr><td>招标人</td><td>XX市城市建设投资集团</td></tr>
<tr><td>中标人</td><td>XX建设工程有限公司</td></tr>
<tr><td>项目名称</td><td>道路建设项目</td></tr>
<tr><td>中标金额</td><td>18500万元</td></tr>
<tr><td>建设内容</td><td>道路路基、路面、桥梁工程</td></tr>
</table>
</body>
</html>"""

    engine.add_texts([intent_notice, award_notice, tender_notice, result_notice])
    print(f"✓ 已添加 {engine.get_document_count()} 个公告")

    # Example 1: one search per buyer-side label.
    banner("[示例 1] 检索所有包含'招标人'或'采购人'的公告", lead="\n")

    for term in ('招标人', '采购人', '采购意向单位'):
        print(f"\n搜索关键词: '{term}'")
        hits = engine.search_keywords([term], top_k=10)

        if not hits:
            print("未找到相关结果\n")
            continue

        print(f"找到 {len(hits)} 个相关公告:\n")
        for idx, hit in enumerate(hits, 1):
            print(f"--- 公告 {idx} ---")
            show_fields(extract_key_info(hit.page_content))

    # Example 2: announcements that mention a winning bidder.
    banner("[示例 2] 检索所有包含'中标人'的公告", lead="\n")

    hits = engine.search_keywords(['中标人'], top_k=10)
    if hits:
        print(f"\n找到 {len(hits)} 个中标公告:\n")
        for idx, hit in enumerate(hits, 1):
            print(f"--- 中标公告 {idx} ---")
            show_fields(extract_key_info(hit.page_content))

    # Example 3: search with both the tenderer and winner keywords.
    banner("[示例 3] 检索同时包含'招标人'和'中标人'的公告(完整项目信息)", lead="\n")

    hits = engine.search_keywords(['招标人', '中标人'], top_k=10)
    if hits:
        print(f"\n找到 {len(hits)} 个完整项目公告:\n")
        for idx, hit in enumerate(hits, 1):
            print(f"--- 项目 {idx} ---")
            fields = extract_key_info(hit.page_content)

            # Grouped display of the main fields.
            print(f" 招标人: {pick(fields, '招标人', '采购人')}")
            print(f" 中标人: {fields.get('中标人')}")
            print(f" 项目: {fields.get('项目名称')}")
            print(f" 金额: {pick(fields, '中标金额', '预算金额')}")
            print()

    # Example 4: search by project category.
    banner("[示例 4] 检索特定项目类型", lead="\n")

    categories = (
        (['路灯', '照明'], "照明工程"),
        (['交通', '系统'], "交通系统"),
        (['道路', '建设'], "道路建设"),
        (['办公', '设备'], "办公设备"),
    )

    for terms, label in categories:
        print(f"\n搜索 '{label}' 相关项目:")
        hits = engine.search_keywords(terms, top_k=5)

        if not hits:
            print(" 未找到相关项目")
            continue

        for idx, hit in enumerate(hits, 1):
            fields = extract_key_info(hit.page_content)
            buyer = pick(fields, '招标人', '采购人', '采购意向单位')
            title = fields.get('项目名称')
            money = pick(fields, '中标金额', '预算金额')

            print(f" {idx}. {title}")
            print(f" 招标方: {buyer}")
            print(f" 金额: {money}")

    # Summary over the announcements returned by a combined search.
    banner("提取结果总结", lead="\n")

    all_hits = engine.search_keywords(['招标人', '采购人', '中标人'], top_k=100)

    buyers = set()
    bidders = set()
    total = 0

    for hit in all_hits:
        fields = extract_key_info(hit.page_content)

        buyer = pick(fields, '招标人', '采购人', '采购意向单位')
        if buyer:
            buyers.add(buyer)

        bidder = fields.get('中标人')
        if bidder:
            bidders.add(bidder)

        # Simple amount parsing: take the number directly before '万'.
        money_text = pick(fields, '中标金额', '预算金额') or ''
        found = re.search(r'(\d+(?:\.\d+)?)万', money_text)
        if found:
            total += float(found.group(1))

    print(f"\n共处理 {len(all_hits)} 个公告")
    print(f"涉及招标方 {len(buyers)} 个:")
    for name in buyers:
        print(f" - {name}")

    print(f"\n涉及中标方 {len(bidders)} 个:")
    for name in bidders:
        print(f" - {name}")

    print(f"\n项目总金额: 约 {total:.2f} 万元")

    banner("示例完成!", lead="\n")
# Run the demo only when this file is executed directly as a script.
if __name__ == "__main__":
    demo_extract_bid_info()
|