extract_bid_info.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. # -*- coding: utf-8 -*-
  2. """
  3. 从招投标公告中提取招标人、中标人等关键信息
  4. 这个示例展示了如何使用 BidiRag 召回 HTML 公告中的内容片段,
  5. 并从中提取关键信息。
  6. """
  7. import sys
  8. import os
  9. import re
  10. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  11. from bdirag.bidi_rag import BidiRag
  12. def extract_key_info(html_content):
  13. """从 HTML 内容中提取关键信息"""
  14. from bs4 import BeautifulSoup
  15. info = {}
  16. try:
  17. # 尝试使用 lxml,如果失败则使用内置的 html.parser
  18. try:
  19. soup = BeautifulSoup(html_content, 'lxml')
  20. except:
  21. soup = BeautifulSoup(html_content, 'html.parser')
  22. # 查找所有表格行
  23. for tr in soup.find_all('tr'):
  24. cells = tr.find_all(['td', 'th'])
  25. if len(cells) >= 2:
  26. key = cells[0].get_text(strip=True)
  27. value = cells[1].get_text(strip=True)
  28. # 匹配关键字段
  29. if '招标人' in key:
  30. info['招标人'] = value
  31. elif '采购人' in key and '意向' not in key:
  32. info['采购人'] = value
  33. elif '采购意向单位' in key:
  34. info['采购意向单位'] = value
  35. elif '中标人' in key:
  36. info['中标人'] = value
  37. elif '项目名称' in key or '采购项目名称' in key:
  38. info['项目名称'] = value
  39. elif '中标金额' in key:
  40. info['中标金额'] = value
  41. elif '预算金额' in key:
  42. info['预算金额'] = value
  43. except Exception as e:
  44. # 如果解析失败,使用正则表达式作为后备
  45. patterns = {
  46. '招标人': r'<td>招标人</td>\s*<td>([^<]+)</td>',
  47. '采购人': r'<td>采购人</td>\s*<td>([^<]+)</td>',
  48. '采购意向单位': r'<td>采购意向单位</td>\s*<td>([^<]+)</td>',
  49. '中标人': r'<td>中标人</td>\s*<td>([^<]+)</td>',
  50. '项目名称': r'<td>项目名称</td>\s*<td>([^<]+)</td>',
  51. '中标金额': r'<td>中标金额</td>\s*<td>([^<]+)</td>',
  52. '预算金额': r'<td>预算金额</td>\s*<td>([^<]+)</td>',
  53. }
  54. for key, pattern in patterns.items():
  55. match = re.search(pattern, html_content)
  56. if match:
  57. info[key] = match.group(1).strip()
  58. return info
  59. def demo_extract_bid_info():
  60. """演示从招投标公告中提取信息"""
  61. print("=" * 80)
  62. print("招投标公告信息提取示例")
  63. print("=" * 80)
  64. # 1. 初始化 BidiRag
  65. print("\n[步骤 1] 初始化 BidiRag (使用 bm25_html_tree 方法)...")
  66. rag = BidiRag(rag_method='bm25_html_tree')
  67. # 2. 添加公告数据
  68. print("\n[步骤 2] 添加招投标公告...")
  69. announcements = [
  70. """<html>
  71. <body>
  72. <h1>政府采购意向公告</h1>
  73. <table>
  74. <tr><td>采购意向单位</td><td>大连长兴岛经济技术开发区交流岛街道办事处</td></tr>
  75. <tr><td>采购项目名称</td><td>交流岛滨海路夜间出行照明提升工程</td></tr>
  76. <tr><td>预算金额</td><td>147.060000万元</td></tr>
  77. <tr><td>采购品目</td><td>路灯照明工程</td></tr>
  78. <tr><td>采购需求概况</td><td>在滨海路安装太阳能路灯200盏(单排)</td></tr>
  79. <tr><td>预计采购时间</td><td>2026-05</td></tr>
  80. </table>
  81. </body>
  82. </html>""",
  83. """<html>
  84. <body>
  85. <h1>中标公告</h1>
  86. <table>
  87. <tr><td>采购人</td><td>XX市财政局</td></tr>
  88. <tr><td>项目名称</td><td>办公设备采购项目</td></tr>
  89. <tr><td>中标人</td><td>XX办公设备有限公司</td></tr>
  90. <tr><td>中标金额</td><td>50万元</td></tr>
  91. <tr><td>采购内容</td><td>电脑、打印机、复印机等办公设备</td></tr>
  92. </table>
  93. </body>
  94. </html>""",
  95. """<html>
  96. <body>
  97. <h1>招标公告</h1>
  98. <table>
  99. <tr><td>招标人</td><td>XX市交通运输局</td></tr>
  100. <tr><td>项目名称</td><td>智慧交通系统建设项目</td></tr>
  101. <tr><td>项目预算</td><td>5000万元</td></tr>
  102. <tr><td>招标内容</td><td>交通信号控制系统、视频监控系统</td></tr>
  103. <tr><td>投标截止时间</td><td>2024-06-15</td></tr>
  104. </table>
  105. </body>
  106. </html>""",
  107. """<html>
  108. <body>
  109. <h1>中标结果公告</h1>
  110. <table>
  111. <tr><td>招标人</td><td>XX市城市建设投资集团</td></tr>
  112. <tr><td>中标人</td><td>XX建设工程有限公司</td></tr>
  113. <tr><td>项目名称</td><td>道路建设项目</td></tr>
  114. <tr><td>中标金额</td><td>18500万元</td></tr>
  115. <tr><td>建设内容</td><td>道路路基、路面、桥梁工程</td></tr>
  116. </table>
  117. </body>
  118. </html>"""
  119. ]
  120. rag.add_texts(announcements)
  121. print(f"✓ 已添加 {rag.get_document_count()} 个公告")
  122. # 3. 检索招标人信息
  123. print("\n" + "=" * 80)
  124. print("[示例 1] 检索所有包含'招标人'或'采购人'的公告")
  125. print("=" * 80)
  126. # 使用 OR 逻辑:分别搜索
  127. for keyword in ['招标人', '采购人', '采购意向单位']:
  128. print(f"\n搜索关键词: '{keyword}'")
  129. results = rag.search_keywords([keyword], top_k=10)
  130. if results:
  131. print(f"找到 {len(results)} 个相关公告:\n")
  132. for i, doc in enumerate(results, 1):
  133. print(f"--- 公告 {i} ---")
  134. info = extract_key_info(doc.page_content)
  135. for key, value in info.items():
  136. print(f" {key}: {value}")
  137. print()
  138. else:
  139. print("未找到相关结果\n")
  140. # 4. 检索中标人信息
  141. print("\n" + "=" * 80)
  142. print("[示例 2] 检索所有包含'中标人'的公告")
  143. print("=" * 80)
  144. results = rag.search_keywords(['中标人'], top_k=10)
  145. if results:
  146. print(f"\n找到 {len(results)} 个中标公告:\n")
  147. for i, doc in enumerate(results, 1):
  148. print(f"--- 中标公告 {i} ---")
  149. info = extract_key_info(doc.page_content)
  150. for key, value in info.items():
  151. print(f" {key}: {value}")
  152. print()
  153. # 5. 同时包含招标人和中标人的公告
  154. print("\n" + "=" * 80)
  155. print("[示例 3] 检索同时包含'招标人'和'中标人'的公告(完整项目信息)")
  156. print("=" * 80)
  157. results = rag.search_keywords(['招标人', '中标人'], top_k=10)
  158. if results:
  159. print(f"\n找到 {len(results)} 个完整项目公告:\n")
  160. for i, doc in enumerate(results, 1):
  161. print(f"--- 项目 {i} ---")
  162. info = extract_key_info(doc.page_content)
  163. # 分类显示
  164. purchaser = info.get('招标人') or info.get('采购人')
  165. winner = info.get('中标人')
  166. project = info.get('项目名称')
  167. amount = info.get('中标金额') or info.get('预算金额')
  168. print(f" 招标人: {purchaser}")
  169. print(f" 中标人: {winner}")
  170. print(f" 项目: {project}")
  171. print(f" 金额: {amount}")
  172. print()
  173. # 6. 检索特定类型项目
  174. print("\n" + "=" * 80)
  175. print("[示例 4] 检索特定项目类型")
  176. print("=" * 80)
  177. search_terms = [
  178. (['路灯', '照明'], "照明工程"),
  179. (['交通', '系统'], "交通系统"),
  180. (['道路', '建设'], "道路建设"),
  181. (['办公', '设备'], "办公设备"),
  182. ]
  183. for keywords, category in search_terms:
  184. print(f"\n搜索 '{category}' 相关项目:")
  185. results = rag.search_keywords(keywords, top_k=5)
  186. if results:
  187. for i, doc in enumerate(results, 1):
  188. info = extract_key_info(doc.page_content)
  189. purchaser = info.get('招标人') or info.get('采购人') or info.get('采购意向单位')
  190. project = info.get('项目名称')
  191. amount = info.get('中标金额') or info.get('预算金额')
  192. print(f" {i}. {project}")
  193. print(f" 招标方: {purchaser}")
  194. print(f" 金额: {amount}")
  195. else:
  196. print(" 未找到相关项目")
  197. # 7. 总结
  198. print("\n" + "=" * 80)
  199. print("提取结果总结")
  200. print("=" * 80)
  201. # 统计所有公告
  202. all_results = rag.search_keywords(['招标人', '采购人', '中标人'], top_k=100)
  203. purchasers = set()
  204. winners = set()
  205. total_amount = 0
  206. for doc in all_results:
  207. info = extract_key_info(doc.page_content)
  208. purchaser = info.get('招标人') or info.get('采购人') or info.get('采购意向单位')
  209. if purchaser:
  210. purchasers.add(purchaser)
  211. winner = info.get('中标人')
  212. if winner:
  213. winners.add(winner)
  214. amount_str = info.get('中标金额') or info.get('预算金额') or ''
  215. # 提取金额数字(简单处理)
  216. amount_match = re.search(r'(\d+(?:\.\d+)?)万', amount_str)
  217. if amount_match:
  218. total_amount += float(amount_match.group(1))
  219. print(f"\n共处理 {len(all_results)} 个公告")
  220. print(f"涉及招标方 {len(purchasers)} 个:")
  221. for p in purchasers:
  222. print(f" - {p}")
  223. print(f"\n涉及中标方 {len(winners)} 个:")
  224. for w in winners:
  225. print(f" - {w}")
  226. print(f"\n项目总金额: 约 {total_amount:.2f} 万元")
  227. print("\n" + "=" * 80)
  228. print("示例完成!")
  229. print("=" * 80)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo_extract_bid_info()