extract.py 55 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783
  1. #coding:utf8
  2. '''
  3. Created on 2019年1月4日
  4. @author: User
  5. '''
  6. import os
  7. from bs4 import BeautifulSoup, Comment
  8. import copy
  9. import re
  10. import sys
  11. import os
  12. import codecs
  13. import requests
  14. import time
  15. from unicodedata import normalize
  16. _time1 = time.time()
  17. sys.path.append(os.path.abspath("../.."))
  18. from BiddingKG.dl.common.Utils import *
  19. import BiddingKG.dl.entityLink.entityLink as entityLink
  20. import BiddingKG.dl.interface.predictor as predictor
  21. import BiddingKG.dl.interface.Preprocessing as Preprocessing
  22. import BiddingKG.dl.interface.getAttributes as getAttributes
  23. import BiddingKG.dl.complaint.punish_predictor as punish_rule
  24. import json
  25. from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
  26. from BiddingKG.dl.ratio.re_ratio import extract_ratio
  27. from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list, extract_addr
  28. from BiddingKG.dl.interface.get_label_dic import get_all_label
  29. from BiddingKG.dl.channel.channel_bert import merge_channel
  30. from BiddingKG.dl.interface.kvtree_search import get_kvtree_value
  31. from BiddingKG.dl.interface.special_debt_extract import get_debt_info
  32. # 自定义jsonEncoder
  33. class MyEncoder(json.JSONEncoder):
  34. def default(self, obj):
  35. if isinstance(obj, np.ndarray):
  36. return obj.tolist()
  37. elif isinstance(obj, bytes):
  38. return str(obj, encoding='utf-8')
  39. elif isinstance(obj, (np.float_, np.float16, np.float32,
  40. np.float64)):
  41. return float(obj)
  42. elif isinstance(obj,str):
  43. return obj
  44. return json.JSONEncoder.default(self, obj)
  45. def get_login_web_set():
  46. file = os.path.join(os.path.dirname(__file__),"login_weblist.txt")
  47. list_web = []
  48. try:
  49. if os.path.exists(file):
  50. with open(file,"r",encoding="utf8") as f:
  51. while 1:
  52. line = f.readline()
  53. if not line:
  54. break
  55. line = line.strip()
  56. if line:
  57. list_web.append(line)
  58. except Exception as e:
  59. traceback.print_exc()
  60. _set = set(list_web)
  61. log("get_login_web_set length %d"%(len(_set)))
  62. return _set
  63. set_login_web = get_login_web_set()
  64. def extractCount(extract_dict,page_attachments,web_source_name):
  65. # time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  66. if len(extract_dict):
  67. _extract = extract_dict
  68. else:
  69. _extract = {}
  70. # print(_extract)
  71. dict_pack = _extract.get("prem",{})
  72. extract_count = 0
  73. list_code = _extract.get("code",[])
  74. word_count = _extract.get("word_count",{})
  75. if word_count.get("正文",0)>500:
  76. extract_count += 3
  77. if len(list_code)>0:
  78. project_code = list_code[0]
  79. else:
  80. project_code = ""
  81. project_name = _extract.get("name","")
  82. bidding_budget = ""
  83. win_tenderer = ""
  84. win_bid_price = ""
  85. linklist_count = 0
  86. for _key in dict_pack.keys():
  87. if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
  88. extract_count += 1
  89. if bidding_budget=="":
  90. bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
  91. for _role in dict_pack[_key]["roleList"]:
  92. if isinstance(_role,list):
  93. extract_count += 1
  94. if _role[2]!='' and float(_role[2])>0:
  95. extract_count += 1
  96. if _role[0]=="tenderee":
  97. tenderee = _role[1]
  98. if _role[0]=="win_tenderer":
  99. if _role[1] is not None and _role[1]!="":
  100. extract_count += 2
  101. if win_tenderer=="":
  102. win_tenderer = _role[1]
  103. if _role[2]!='' and float(_role[2])>0:
  104. extract_count += 2
  105. if win_bid_price=="":
  106. win_bid_price = str(float(_role[2]))
  107. if _role[0]=="agency":
  108. agency = _role[1]
  109. if isinstance(_role,dict):
  110. extract_count += 1
  111. if "role_money" in _role:
  112. if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
  113. extract_count += 1
  114. if _role.get("role_name")=="tenderee":
  115. tenderee = _role["role_text"]
  116. if _role.get("role_name")=="win_tenderer":
  117. if _role["role_text"] is not None and _role["role_text"]!="":
  118. extract_count += 2
  119. if win_tenderer=="":
  120. win_tenderer = _role["role_text"]
  121. if "role_money" in _role:
  122. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  123. extract_count += 2
  124. if win_bid_price=="":
  125. win_bid_price = str(float(_role["role_money"]["money"]))
  126. if _role["role_name"]=="agency":
  127. agency = _role["role_text"]
  128. linklist = _role.get("linklist",[])
  129. for link in linklist:
  130. for l in link:
  131. if l!="":
  132. linklist_count += 1
  133. extract_count += linklist_count//2
  134. if project_code!="":
  135. extract_count += 1
  136. if project_name!="":
  137. extract_count += 1
  138. if page_attachments is not None and page_attachments!='':
  139. try:
  140. _attachments = json.loads(page_attachments)
  141. set_md5 = set()
  142. has_zhaobiao = False
  143. has_qingdan = False
  144. if len(_attachments)>0:
  145. for _atta in _attachments:
  146. classification = _atta.get("classification","")
  147. set_md5.add(_atta.get("fileMd5"))
  148. if str(classification)=='招标文件':
  149. has_zhaobiao = True
  150. if str(classification)=='采购清单':
  151. has_qingdan = True
  152. extract_count += len(set_md5)//2+1
  153. if has_zhaobiao:
  154. extract_count += 2
  155. if has_qingdan:
  156. extract_count += 1
  157. except Exception as e:
  158. traceback.print_exc()
  159. pass
  160. list_approval_dict = _extract.get("approval",[])
  161. for _dict in list_approval_dict:
  162. for k,v in _dict.items():
  163. if v is not None and v!='' and v!="未知":
  164. extract_count += 1
  165. punish_dict = _extract.get("punish",{})
  166. for k,v in punish_dict.items():
  167. if v is not None and v!='' and v!="未知":
  168. extract_count += 1
  169. if web_source_name in set_login_web:
  170. extract_count -= 3
  171. product = _extract.get("product","")
  172. extract_count += len(str(product).split(","))//5
  173. return extract_count
  174. # 字符编码标准化
  175. def str_normalize(text):
  176. # time1 = time.time()
  177. cn_punctuation = "¥,。:;{}!?()<"
  178. text_split = re.split("([{}])+".format(cn_punctuation),text)
  179. # print(text_split)
  180. new_text = ""
  181. for s in text_split:
  182. if re.search("^[{}]+$".format(cn_punctuation),s):
  183. new_text += s
  184. else:
  185. new_text += normalize('NFKD', s)
  186. # print("str_normalize cost time %s"%str(time.time()-time1))
  187. # print(new_text)
  188. return new_text
  189. # 修复prem中地区前缀不完整实体
  190. def repair_entity(prem,district_dict,list_articles):
  191. district_dict = district_dict['district']
  192. province = district_dict['province'] if district_dict['province'] and district_dict['province'] not in ['未知','全国'] else ""
  193. city = district_dict['city'] if district_dict['city'] and district_dict['city']!='未知' else ""
  194. district = district_dict['district'] if district_dict['district'] and district_dict['district']!='未知' else ""
  195. content_text = list_articles[0].content
  196. autonomous_region_dict = {
  197. "新疆":"新疆维吾尔",
  198. "西藏":"西藏",
  199. "内蒙古":"内蒙古",
  200. "广西":"广西壮族",
  201. "宁夏":"宁夏回族"
  202. }
  203. for package,_prem in prem[0]['prem'].items():
  204. for role in _prem['roleList']:
  205. if role['role_name'] in ['tenderee','agency']:
  206. role_text = role['role_text']
  207. if re.search("^[省市县区]",role_text):
  208. if role_text[0]=='省' and role_text[:2] not in ['省道']:
  209. role['role_text'] = province + role_text
  210. elif role_text[0]=='市' and role_text[:2] not in ['市政','市场']:
  211. if district+'市' in content_text:
  212. # 县级市
  213. role['role_text'] = district + role_text
  214. else:
  215. role['role_text'] = city + role_text
  216. elif role_text[0] in ['县','区']:
  217. role['role_text'] = district + role_text
  218. elif re.search("^自治[区州县]",role_text):
  219. if role_text[:3]=='自治区':
  220. role['role_text'] = autonomous_region_dict.get(province,"") + role_text
  221. elif role_text[:3] in ['自治县',"自治州"]:
  222. if re.search("自治[县州]?$",district):
  223. role['role_text'] = re.sub("自治[县州]?","",district) + role_text
  224. elif re.search("族$",district):
  225. role['role_text'] = district + role_text
  226. elif re.search("自治[县州]?$",city):
  227. role['role_text'] = re.sub("自治[县州]?","",city) + role_text
  228. elif re.search("族$",city):
  229. role['role_text'] = city + role_text
  230. def fix_table_structure_preserve_order(html):
  231. """
  232. 修复table结构中tr与tbody平级的问题
  233. 保持原有行顺序不变
  234. """
  235. soup = BeautifulSoup(html, 'html.parser')
  236. for table in soup.find_all('table'):
  237. if table.find_all('tr', recursive=False) != []:
  238. # 获取table下所有直接子节点
  239. children = list(table.children)
  240. tbody_new = soup.new_tag('tbody')
  241. table.append(tbody_new)
  242. for child in children:
  243. if child.name:
  244. if child.name == 'tbody':
  245. for tag in list(child.children):
  246. tbody_new.append(tag.extract())
  247. child.extract()
  248. else:
  249. tbody_new.append(child.extract())
  250. return str(soup)
  251. def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
  252. cost_time = dict()
  253. if web_source_no == None:
  254. web_source_no = ''
  255. if web_source_name == None:
  256. web_source_name = ''
  257. start_time = time.time()
  258. log("start process doc %s"%(str(doc_id)))
  259. # 字符编码标准化
  260. text = str_normalize(text)
  261. text = fix_table_structure_preserve_order(text) # 20250331 修复表格tr tbody平级问题
  262. list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
  263. log("get preprocessed done of doc_id%s"%(doc_id))
  264. cost_time["preprocess"] = round(time.time()-start_time,2)
  265. cost_time.update(_cost_time)
  266. '''大纲提取及大纲内容相关提取'''
  267. start_time = time.time()
  268. sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
  269. parse_document = ParseDocument(text, True,list_obj=sentence2_list)
  270. requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
  271. if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
  272. parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
  273. requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
  274. # print('out_lines',out_lines)
  275. # if addr_bidopen_text == '':
  276. # addr_bidopen_text = extract_addr(list_articles[0].content)
  277. addr_dic, time_dic, code_investment = predictor.getPredictor('entity_type_rule').predict(list_entitys, list_sentences, list_articles)
  278. if addr_bidopen_text != '' and 'addr_bidopen' not in addr_dic:
  279. addr_dic['addr_bidopen'] = addr_bidopen_text
  280. if addr_bidsend_text != '' and 'addr_bidsend' not in addr_dic:
  281. addr_dic['addr_bidsend'] = addr_bidsend_text
  282. log("get outline done of doc_id%s"%(doc_id))
  283. cost_time["outline"] = round(time.time()-start_time,2)
  284. '''从 kvtree 正则匹配要素'''
  285. start_time = time.time()
  286. kv_single_dic, kv_addr_dic = get_kvtree_value(text)
  287. log("get kvtree done of doc_id%s"%(doc_id))
  288. cost_time["kvtree"] = round(time.time()-start_time,2)
  289. # 过滤掉Redis里值为0的错误实体
  290. # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
  291. # #依赖句子顺序
  292. # start_time = time.time() # 公告类型/生命周期提取 此处作废 换到后面预测 2022/4/29
  293. # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
  294. # web_source_no=web_source_no,original_docchannel=original_docchannel)
  295. # cost_time["channel"] = round(time.time()-start_time,2)
  296. start_time = time.time() # 项目编号、名称提取
  297. codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
  298. if re.search('破产清算案', title):
  299. end = re.search('破产清算案', title).end()
  300. codeName[0]['name'] = title[:end]
  301. log("get codename done of doc_id%s"%(doc_id))
  302. cost_time["codename"] = round(time.time()-start_time,2)
  303. start_time = time.time() # 公告类别预测
  304. channel_dic, msc = predictor.getPredictor("channel").predict_merge(title, list_sentences[0], text,original_docchannel, web_source_no)
  305. cost_time["rule_channel"] = round(time.time() - start_time, 2)
  306. start_time = time.time() # 角色金额模型提取
  307. predictor.getPredictor("prem").predict(list_sentences,list_entitys)
  308. log("get prem done of doc_id%s"%(doc_id))
  309. cost_time["prem"] = round(time.time()-start_time,2)
  310. # start_time = time.time() # 产品名称及废标原因提取 此处作废 换到后面预测 2022/4/29
  311. # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
  312. # fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类
  313. # # predictor.getPredictor("product").predict(list_sentences, list_entitys)
  314. # log("get product done of doc_id%s"%(doc_id))
  315. # cost_time["product"] = round(time.time()-start_time,2)
  316. start_time = time.time() # 产品相关要素正则提取 单价、数量、品牌规格 ; 项目、需求、预算、时间
  317. product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
  318. log("get product attributes done of doc_id%s"%(doc_id))
  319. cost_time["product_attrs"] = round(time.time()-start_time,2)
  320. # 是否为存款类项目
  321. deposit_project = is_deposit_project(title, codeName[0]['name'], requirement_text)
  322. start_time = time.time() #正则角色提取
  323. predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName, channel_dic, all_winner=is_all_winner(title), req_scope=requirement_scope, deposit_project=deposit_project)
  324. cost_time["rule"] = round(time.time()-start_time,2)
  325. '''正则补充最后一句实体日期格式为招标或代理 2021/12/30;正则最后补充角色及去掉包含 公共资源交易中心 的招标人'''
  326. start_time = time.time() #正则角色提取
  327. predictor.getPredictor("roleRuleFinal").predict(list_articles,list_sentences,list_entitys, codeName)
  328. cost_time["roleRuleFinal"] = round(time.time()-start_time,2)
  329. start_time = time.time() #正则招标人召回
  330. predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName)
  331. cost_time["tendereeRuleRecall"] = round(time.time()-start_time,2)
  332. '''规则调整角色概率'''
  333. start_time = time.time() #
  334. predictor.getPredictor("rolegrade").predict(list_sentences,list_entitys,original_docchannel)
  335. cost_time["rolegrade"] = round(time.time()-start_time,2)
  336. '''规则调整金额概率'''
  337. start_time = time.time() #
  338. predictor.getPredictor("moneygrade").predict(list_sentences,list_entitys)
  339. cost_time["moneygrade"] = round(time.time()-start_time,2)
  340. start_time = time.time() #联系人模型提取
  341. predictor.getPredictor("epc").predict(list_sentences,list_entitys)
  342. log("get epc done of doc_id%s"%(doc_id))
  343. cost_time["person"] = round(time.time()-start_time,2)
  344. start_time = time.time() # 时间类别提取
  345. predictor.getPredictor("time").predict(list_sentences, list_entitys)
  346. log("get time done of doc_id%s"%(doc_id))
  347. cost_time["time"] = round(time.time()-start_time,2)
  348. start_time = time.time() # 保证金支付方式
  349. payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content)
  350. cost_time["deposit"] = round(time.time()-start_time,2)
  351. # 需在getPredictor("prem").predict后 getAttributes.getPREMs 前 规则调整 监理|施工|设计|勘察类别公告的费用 为招标或中标金额
  352. predictor.getPredictor("prem").correct_money_by_rule(title, list_entitys, list_articles)
  353. # 2021-12-29新增:提取:总价,单价
  354. start_time = time.time() # 总价单价提取
  355. predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys)
  356. cost_time["total_unit_money"] = round(time.time()-start_time, 2)
  357. # 依赖句子顺序
  358. start_time = time.time() # 实体链接
  359. entityLink.link_entitys(list_entitys)
  360. doctitle_refine = entityLink.doctitle_refine(title)
  361. nlp_enterprise,nlp_enterprise_attachment, dict_enterprise = entityLink.get_nlp_enterprise(list_entitys[0])
  362. prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time)
  363. log("get attributes done of doc_id%s"%(doc_id))
  364. cost_time["attrs"] = round(time.time()-start_time,2)
  365. if original_docchannel != 302: # 审批项目不做下面提取
  366. '''表格要素提取'''
  367. table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise+nlp_enterprise_attachment, web_source_name, is_all_winner(title))
  368. # print('表格提取中标人:', table_prem)
  369. # print('原提取角色:', prem[0]['prem'])
  370. if table_prem:
  371. getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem, in_attachment=in_attachment)
  372. '''候选人提取'''
  373. candidate_top3_prem, candidate_dic, in_attachment = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise+nlp_enterprise_attachment)
  374. # print('表格提取候选人:', candidate_top3_prem)
  375. getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem, in_attachment=in_attachment)
  376. '''获取联合体信息'''
  377. getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)
  378. '''修正采购公告表格形式多种采购产品中标价格;中标金额小于所有产品总金额则改为总金额'''
  379. getAttributes.correct_rolemoney(prem, total_product_money, list_articles)
  380. '''修正channel预测类别为招标公告却有中标人及预测为中标信息却无中标关键词的类别''' # 依赖 prem
  381. start_time = time.time()
  382. # content = list_articles[0].content
  383. # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
  384. if original_docchannel == 302:
  385. channel_dic = {"docchannel":
  386. { "docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目" }
  387. }
  388. else:
  389. channel_dic, msc = predictor.getPredictor("channel").final_change(channel_dic, prem[0], original_docchannel, msc)
  390. # print('msc', msc)
  391. channel_dic = merge_channel(list_articles,channel_dic,original_docchannel) # channel_dic 根据新模型预测结合判断,整合结果
  392. cost_time["rule_channel2"] = round(time.time()-start_time,2)
  393. '''一包多中标人提取及所有金额提取'''
  394. all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, is_all_winner(title))
  395. start_time = time.time() # 产品名称及废标原因提取 #依赖 docchannel结果
  396. fail = channel_dic['docchannel']['docchannel'] == "废标公告"
  397. fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail,out_lines=out_lines) #只返回失败原因,产品已加入到Entity类 #2022/7/29补充返回产品,方便行业分类调用
  398. # predictor.getPredictor("product").predict(list_sentences, list_entitys)
  399. log("get product done of doc_id%s"%(doc_id))
  400. cost_time["product"] = round(time.time()-start_time,2)
  401. prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time,prem,channel_dic))
  402. '''更新单一来源招标公告中标角色为预中标'''
  403. getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)
  404. '''公告无表格格式时,采购意向预测''' #依赖 docchannel结果 依赖产品及prem
  405. '''把产品要素提取结果在项目名称的添加到 采购需求,预算时间,采购时间 要素中'''
  406. predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time)
  407. '''行业分类提取,需要用标题、项目名称、产品、及prem 里面的角色'''
  408. industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem, product_attrs=product_attrs)
  409. '''地区获取'''
  410. start_time = time.time()
  411. # district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
  412. district = predictor.getPredictor('district').predict_area(title, list_articles[0].content, web_source_name, prem=prem[0]['prem'], addr_dic=addr_dic, list_entity=list_entitys[0])
  413. cost_time["district"] = round(time.time() - start_time, 2)
  414. '''根据district提取结果修复实体'''
  415. repair_entity(prem,district,list_articles)
  416. '''根据数据源最后召回招标人角色'''
  417. prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(web_source_no, web_source_name, prem)
  418. '''根据关键词表生成项目标签'''
  419. project_label = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem)
  420. # 额外需求的标签
  421. project_label = predictor.getPredictor('project_label').predict_other(project_label,industry,title,codeName[0]['name'],','.join(product_list),list_articles)
  422. # print(project_label)
  423. '''产权分类二级标签'''
  424. property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list),project_name=codeName[0]['name'], prem=prem,channel_dic=channel_dic)
  425. '''最终验证prem'''
  426. getAttributes.confirm_prem(prem[0]['prem'], channel_dic, deposit_project, prem[0]['total_tendereeMoney'])
  427. '''通过产品补充标段包名20241203'''
  428. getAttributes.add_package_name(prem[0]['prem'], list_entitys[0], product_list, name=codeName[0]['name'])
  429. # 提取拟在建所需字段
  430. start_time = time.time()
  431. pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text, web_source_name, industry)
  432. log("pb_extract done of doc_id%s"%(doc_id))
  433. cost_time["pb_extract"] = round(time.time() - start_time, 2)
  434. '''打标签'''
  435. label_dic = get_all_label(title, list_articles[0].content, prem[0]['prem'])
  436. '''评标评分提取'''
  437. bid_score = predictor.getPredictor('bid_score').predict(text, nlp_enterprise+nlp_enterprise_attachment)
  438. # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
  439. # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
  440. version_date = {'version_date': '2025-04-22'}
  441. data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
  442. if original_docchannel == 302:
  443. approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text, nlp_enterprise=nlp_enterprise+nlp_enterprise_attachment)
  444. approval = predictor.getPredictor("approval").add_ree2approval(approval , prem[0]['prem'])
  445. approval = predictor.getPredictor("approval").add_codename2approval(approval , codeName)
  446. data_res['prem'] = {} # 审批项目不要这项
  447. data_res['approval'] = approval[:100] # 20250217 限制获取最多100个项目
  448. if web_source_no == 'XM6486':
  449. debt_dic = get_debt_info(text) # 专项债信息提取
  450. if debt_dic.get('district', '') != '':
  451. district = predictor.getPredictor('district').predict_area(debt_dic['district'], '', web_source_name)
  452. debt_dic['district'] = district['district']
  453. data_res['district'] = district['district']
  454. # 提取专项债信息
  455. data_res['debt_dic'] = debt_dic
  456. data_res['docchannel'] = { "docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目" }
  457. if channel_dic['docchannel']['doctype'] == '处罚公告': # 20240627 处罚公告进行失信要素提取
  458. start_time = time.time() #失信数据要素提取
  459. punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
  460. cost_time["punish"] = round(time.time()-start_time,2)
  461. data_res['punish'] = punish_dic
  462. if "Project" in data_res['prem']:
  463. for d in data_res['prem']['Project']['roleList']:
  464. if d['role_name'] == 'tenderee' and d.get('role_prob', 0.6) < 0.6: # 处罚公告 去掉低概率招标人
  465. data_res['prem']['Project']['roleList'] = [d for d in data_res['prem']['Project']['roleList'] if d['role_name'] != 'tenderee']
  466. break
  467. if len(data_res['prem']['Project']['roleList']) == 0 and data_res['prem']['Project'].get('tendereeMoney', 0) in [0, '0']: # 删除空包
  468. data_res['prem'].pop('Project')
  469. # 把产品属性里面的产品补充到产品列表
  470. if len(data_res['product_attrs']['data']) > 0: # 20241108 如果产品单价数量提取到产品的,原来提取的产品只保留标题中的
  471. data_res['product'] = [it for it in data_res['product'] if it in title]
  472. for d in data_res['product_attrs']['data']:
  473. if isinstance(d['product'], str) and d['product'] not in data_res['product']:
  474. data_res['product'].append(d['product'])
  475. '''最终检查修正招标、中标金额'''
  476. getAttributes.limit_maximum_amount(data_res, list_entitys[0])
  477. '''利用采购意向需求信息补充项目'''
  478. if channel_dic['docchannel']['docchannel'] == '采购意向':
  479. getAttributes.demand_to_prem(data_res.get('demand_info', {}), prem[0]['prem'])
  480. data_res["project_label"] = project_label
  481. data_res["property_label"] = property_label
  482. data_res["doctitle_refine"] = doctitle_refine
  483. data_res["nlp_enterprise"] = nlp_enterprise
  484. data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
  485. data_res["dict_enterprise"] = dict_enterprise
  486. # 要素的个数
  487. data_res['extract_count'] = extractCount(data_res,page_attachments,web_source_name)
  488. # 是否有表格
  489. data_res['exist_table'] = 1 if re.search("<td",text) else 0
  490. data_res["cost_time"] = cost_time
  491. data_res["success"] = True
  492. # 拟在建需建索引字段
  493. data_res["proportion"] = pb_json.get('pb').get('proportion', '')
  494. data_res["pb_project_name"] = pb_json.get('pb').get('project_name_refind', '')
  495. # 资质要求
  496. data_res['aptitude'] = aptitude_text[:1500]
  497. # 采购内容
  498. data_res['requirement'] = requirement_text[:1500]
  499. # 打标签
  500. data_res['label_dic'] = label_dic
  501. # 开标、投标、项目、收货等地址
  502. data_res['addr_dic'] = addr_dic
  503. # 字数
  504. text_main, text_attn = 0, 0
  505. for sentence in list_sentences[0]:
  506. if sentence.in_attachment:
  507. text_attn += len(sentence.sentence_text)
  508. else:
  509. text_main += len(sentence.sentence_text)
  510. data_res['word_count'] = {'正文': text_main, '附件': text_attn}
  511. # 限制产品数量
  512. data_res['product'] = data_res['product'][:500]
  513. data_res['product_attrs']['data'] = data_res['product_attrs']['data'][:500]
  514. # 是否为存款项目
  515. data_res['is_deposit_project'] = deposit_project
  516. data_res['pinmu_name'] = pinmu_name # 品目名称
  517. data_res['policies'] = list_policy # 政策法规
  518. data_res['bid_score'] = bid_score # 评标得分
  519. data_res['time_planned'] = time_dic.get('time_planned', '') # 预计招标时间
  520. data_res['code_investment'] = code_investment # 投资项目编号
  521. for k, v in kv_single_dic.items(): # 没获取到的用kv_tree补充
  522. if data_res.get(k, '') == '':
  523. data_res[k] = v
  524. for k, v in kv_addr_dic.items(): # 没获取到地址的用kv_tree补充
  525. if data_res['addr_dic'].get(k, '') == '' or re.search('时间:', data_res['addr_dic'][k]):
  526. data_res['addr_dic'][k] = v
  527. # for _article in list_articles:
  528. # log(_article.content)
  529. #
  530. # for list_entity in list_entitys:
  531. # for _entity in list_entity:
  532. # log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
  533. # (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
  534. # str(_entity.begin_index),str(_entity.end_index)))
  535. _extract_json = json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
  536. _extract_json = _extract_json.replace("\x06", "").replace("\x05", "").replace("\x07", "")
  537. return _extract_json#, list_articles[0].content, get_ent_context(list_sentences, list_entitys)
  538. def test1(name,content):
  539. user = {
  540. "content": content,
  541. "id":name
  542. }
  543. myheaders = {'Content-Type': 'application/json'}
  544. _resp = requests.post("http://192.168.2.102:15030" + '/article_extract', json=user, headers=myheaders, verify=True)
  545. resp_json = _resp.content.decode("utf-8")
  546. # print(resp_json)
  547. return resp_json
  548. def get_ent_context(list_sentences, list_entitys):
  549. rs_list = []
  550. sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
  551. for list_entity in list_entitys:
  552. for _entity in list_entity:
  553. if _entity.entity_type in ['org', 'company', 'money']:
  554. s = sentences[_entity.sentence_index].sentence_text
  555. b = _entity.wordOffset_begin
  556. e = _entity.wordOffset_end
  557. # print("%s %d %.4f; %s %s %s"%(_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b-10):b], _entity.entity_text, s[e:e+10]))
  558. rs_list.append("%s %d %.4f; %s ## %s ## %s"%(_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b-10):b], _entity.entity_text, s[e:e+10]))
  559. return '\n'.join(rs_list)
  560. def get_role_context(docid, list_sentences, list_entitys):
  561. rs_list = []
  562. sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
  563. for list_entity in list_entitys:
  564. for _entity in list_entity:
  565. if _entity.entity_type in ['org', 'company']:
  566. idx = _entity.entity_id
  567. sentence = sentences[_entity.sentence_index]
  568. # _span = spanWindow(tokens=sentence.tokens, begin_index=_entity.begin_index, end_index=_entity.end_index, size=20,
  569. # center_include=False, word_flag=True, text=_entity.entity_text)
  570. _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=40, center_include=False)
  571. rs_list.append((docid,idx, _entity.entity_type, _entity.label, '%.4f'%_entity.values[_entity.label], _span[0],
  572. _entity.entity_text, _span[1]))
  573. return rs_list
  574. if __name__=="__main__":
  575. import pandas as pd
  576. t1 = time.time()
  577. # text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
  578. title = '打印机'
  579. # df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
  580. # # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
  581. # for i in range(30,50,1):
  582. # text = df.loc[i, 'dochtmlcon']
  583. # rs = json.loads(predict('', text, ''))
  584. # print(rs['demand_info'])
  585. # print(rs['product'])
  586. # print(rs['product_attrs'])
  587. # print(rs)
  588. # df2 = pd.read_csv('E:/导出数据/存款入围框架采购等公告_输入要素.csv')
  589. # df = pd.read_csv('E:\导出数据/存款入围框架采购等公告_预测结果0830.csv')
  590. # df1 = pd.read_csv('E:\导出数据/存款入围框架采购等公告_html.csv')
  591. # df = df.merge(df1, on='docid', how='left')
  592. # print(len(df), df.columns)
  593. # df['rs'] = df['extract_json1'].apply(lambda x: json.loads(x))
  594. # docids = []
  595. # n1 = n2 = 0
  596. # for docid, d, html in zip(df['docid'], df['rs'], df['dochtmlcon']):
  597. # if d['docchannel']['docchannel'] == '招标公告' and '元' in html and 'Project' in d['prem'] and float(
  598. # d['prem']['Project']['tendereeMoney']) == 0:
  599. # docids.append(docid)
  600. # n1 += 1
  601. # else:
  602. # n2 += 1
  603. # print(n1, n2)
  604. # df = df[df['docid'].isin(docids)]
  605. #
  606. # df = df.merge(df2, on='docid', how='left')
  607. # df.fillna('', inplace=True)
  608. # df = df[['docid', 'doctitle', 'page_time', 'web_source_no', 'web_source_name', 'original_docchannel', 'dochtmlcon']]
  609. # print(df.columns)
  610. # # df = df[:10]
  611. # print(len(df))
  612. #
  613. # l = []
  614. # for docid, text, title, page_time,web_no, web_name, channel in zip(df['docid'], df['dochtmlcon'], df['doctitle'], df['page_time'],
  615. # df['web_source_no'], df['web_source_name'], df['original_docchannel']):
  616. # rs, content, roles = predict('', text, title, page_time, web_no, web_name, channel)
  617. # l.append((docid, rs, content, roles))
  618. # df = pd.DataFrame(l, columns=['docid', 'rs', 'content', 'roles'])
  619. # df.to_csv('E:\导出数据/存款入围框架采购等公告_招标公告无招标金额预测结果.csv')
  620. df = pd.read_csv('E:\角色金额数据/银行类招标金额缺失公告_输入要素.csv')
  621. # df2 = pd.read_csv('E:/角色金额数据/银行缺招标金额公告_3_html.csv')
  622. df2 = pd.read_csv('E:/角色金额数据/银行缺招标金额公告_012_html.csv')
  623. print(len(df), len(df2))
  624. df = df[df['docid'].isin(df2['docid'])]
  625. print(len(df))
  626. df = df.merge(df2, how='left', on='docid')
  627. print(len(df))
  628. # df1 = pd.read_excel('E:\角色金额数据/银行缺招标金额公告_检查汇总2.xlsx')
  629. # # df1 = pd.read_excel('E:\角色金额数据/银行缺招标金额公告_检查汇总2_补充招标内容金额后1105.xlsx')
  630. # df1.fillna('', inplace=True)
  631. # df1 = df1[df1['tendereeMoney']==0]
  632. # df = df.merge(df1, on='docid', how='right')
  633. df = df[['docid', 'doctitle', 'page_time', 'web_source_no', 'web_source_name', 'original_docchannel', 'dochtmlcon']]
  634. print(df.columns)
  635. # df = df[:10]
  636. print(len(df))
  637. l = []
  638. for docid, text, title, page_time,web_no, web_name, channel in zip(df['docid'], df['dochtmlcon'], df['doctitle'], df['page_time'],
  639. df['web_source_no'], df['web_source_name'], df['original_docchannel']):
  640. rs, content, roles = predict('', text, title, page_time, web_no, web_name, channel)
  641. l.append((docid, rs, content, roles))
  642. df = pd.DataFrame(l, columns=['docid', 'rs', 'content', 'roles'])
  643. # df.to_csv('E:\角色金额数据/银行缺招标金额公告_3预测结果.csv')
  644. # df.to_csv('E:\角色金额数据/银行缺招标金额公告_012预测结果.csv')
  645. # df.to_csv('E:\角色金额数据/银行缺招标金额公告_检查汇总2_修复后预测结果.csv')
  646. df.to_csv('E:\角色金额数据/银行缺招标金额公告_检查汇总2_修复后预测结果1106.csv')
  647. with open('2.html', 'r', encoding='utf-8') as f:
  648. text = f.read()
  649. t1 = time.time()
  650. print(predict('', text, title))
  651. t2 = time.time()
  652. print(predict('', text, title))
  653. t3 = time.time()
  654. print('第一次耗时:%.4f, 第二次耗时:%.4f'%(t2-t1, t3-t2))
  655. print(predict('',text,title))
  656. # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]
  657. # df = pd.read_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115_2.xlsx')[:]
  658. # new_prem = []
  659. # for i in range(len(df)):
  660. # i = 530
  661. # doc_id = df.loc[i, 'docid']
  662. # text = df.loc[i, 'html']
  663. # # title = df.loc[i, 'doctitle']
  664. # rs = predict(doc_id,text)
  665. # rs = json.loads(rs)
  666. # prem = json.dumps(rs['prem'], ensure_ascii=False)
  667. # # print(rs)
  668. # new_prem.append(prem)
  669. # print(prem)
  670. # break
  671. # df['new_prem'] = pd.Series(new_prem)
  672. # print('耗时:', time.time()-t1)
  673. # # df.to_excel('E:/大网站规则识别/大网站要素提取结果20211115.xlsx')
  674. # df.to_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115.xlsx')
  675. # # pass
  676. # import json
  677. # a = json.loads('''
  678. # { "addr_dic": { "addr_bidopen": "洛阳市公共资源交易中心开标四室" }, "aptitude": "二、申请人资格要求,1、满足《中华人民共和国政府采购法》第二十二条规定。2、落实政府采购政策满足的资格要求:2.1本项目面向中小微(监狱、残疾人福利性单位)企业,节能环保产品优先或强制采购;本项,目执行节约能源、保护环境、扶持不发达地区和少数民族地区等政府采购政策。2.2根据洛财购[2021]4号文件要求,参加政府采购项目的中小企业供应商,持中标(成交)通知,书可向金融机构申请合同融资。详情请登录洛阳市政府采购网(http://luoyang.hngp.gov.cn/),进入网站通知公告窗口了解金融机构提供的融资服务内容。3、本项目的特定资格要求:3.1投标人须具有独立承担民事责任的能力,持有有效的营业执照或事业单位法人证书(投标文,件中须附相关证件的原件扫描件,并加盖单位公章)。3.2根据洛财购[2021]11号文件,投标人须按照规定提供“洛阳市政府采购投标人信用承诺函”,(详见附件,采购人有权在签订合同前要求中标投标人提供相关证明材料以核实中标投标人承诺事,项的真实性。3.3投标人拟派项目负责人需提供单位2024年1月以来任意连续3个月社会保险证明(以社会,保障部门出具的社保证明或社保中心网页截图为准)。3.4本次招标实行资格后审,资格不合格者,取消其投标资格。二、申请人资格要求,1、满足《中华人民共和国政府采购法》第二十二条规定。2、落实政府采购政策满足的资格要求:2.1本项目面向中小微(监狱、残疾人福利性单位)企业,节能环保产品优先或强制采购;本项,目执行节约能源、保护环境、扶持不发达地区和少数民族地区等政府采购政策。2.2根据洛财购[2021]4号文件要求,参加政府采购项目的中小企业供应商,持中标(成交)通知,书可向金融机构申请合同融资。详情请登录洛阳市政府采购网(http://luoyang.hngp.gov.cn/),进入网站通知公告窗口了解金融机构提供的融资服务内容。3、本项目的特定资格要求:3.1投标人须具有独立承担民事责任的能力,持有有效的营业执照或事业单位法人证书(投标文,件中须附相关证件的原件扫描件,并加盖单位公章)。3.2根据洛财购[2021]11号文件,投标人须按照规定提供“洛阳市政府采购投标人信用承诺函”,(详见附件,采购人有权在签订合同前要求中标投标人提供相关证明材料以核实中标投标人承诺事,项的真实性。3.3投标人拟派项目负责人需提供单位2024年1月以来任意连续3个月社会保险证明(以社会,保障部门出具的社保证明或社保中心网页截图为准)。3.4本次招标实行资格后审,资格不合格者,取消其投标资格。", "attachmentTypes": "pdf", "bid_score": [ { "ranking": "1", "score_comprehensive": "84.40", "tenderer": "天津布尔科技有限公司" }, { "ranking": "2", "score_comprehensive": "62.83", "tenderer": "河南铭服云信息科技有限公司" }, { "ranking": "3", "score_comprehensive": "60.84", "tenderer": "河南拓普计算机网络工程有限公司" }, { "ranking": "4", "score_comprehensive": "45.71", "tenderer": "郑州市数字政通信息技术有限公司" } ], "bidway": "竞争性谈判", "candidate": "天津布尔科技有限公司,河南铭服云信息科技有限公司,郑州市数字政通信息技术有限公司,河南拓普计算机网络工程有限公司", "code": [ "洛直政采招标(2025)0014号", "洛采公开-2025-11", "洛直政采招标(2025)0014号-1" ], "code_investment": "", "cost_time": { "attrs": 0.7, "codename": 0.8, "deposit": 0.0, "district": 1.82, "kvtree": 0.02, "moneygrade": 0.0, "nerToken": 1.74, "outline": 0.23, "pb_extract": 0.31, "person": 0.02, "prem": 0.1, "preprocess": 3.23, "product": 0.97, "product_attrs": 0.14, "roleRuleFinal": 0.01, "rolegrade": 0.01, "rule": 0.05, "rule_channel": 0.03, "rule_channel2": 0.32, "tableToText": 0.8700038146972656, "tendereeRuleRecall": 0.0, "time": 0.03, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "dict_enterprise": { "中华人民共和国": { "in_text": 0 }, "中国政府": { "in_text": 0 }, "中国残疾人联合会": { "credit_code": "131000000000180402", "in_text": 0 }, "中经国际招标集团有限公司": { "credit_code": "9111000075820412XD", "in_text": 2 }, "国家统计局国家发展和改革委员财政部": { "in_text": 0 }, "天津布尔科技有限公司": { "credit_code": "91120116086586515N", "in_text": 0 }, "工信部": { "in_text": 0 }, "戒毒管理局": { "in_text": 0 }, "民政部": { "in_text": 0 }, "河南拓普计算机网络工程有限公司": { "credit_code": "914101057126468974", "in_text": 0 }, "河南政府": { "in_text": 0 }, "河南省政府": { "in_text": 0 }, "河南铭服云信息科技有限公司": { "credit_code": "91410103MA4769Q97P", "in_text": 0 }, "洛阳市公共资源交易中心": { "credit_code": "12410300592433925T", "in_text": 0 }, "洛阳市政府": { "in_text": 0 }, "洛阳市生态环境局": { "credit_code": "11410300005372460J", "in_text": 0 }, "洛阳市生态环境局新安分局": { "in_text": 2 }, "洛阳市财政局": { "credit_code": "11410300005372639N", "in_text": 0 }, "洛阳政府": { "in_text": 0 }, "财政部": { "in_text": 0 }, "郑州市数字政通信息技术有限公司": { "credit_code": "91410100680774789D", "in_text": 0 } }, "district": { "area": "华中", "city": "洛阳", "district": "新安", "is_in_text": false, "province": "河南" }, "docchannel": { "docchannel": "中标信息", "doctype": "采招数据", "life_docchannel": "中标信息", "use_original_docchannel": 0 }, "docid": "", "doctitle_refine": "洛阳市生态环境局新安分局新安县大气污染防治精细化管理咨询项目-", "exist_table": 1, "extract_count": 25, "fail_reason": "", "fingerprint": "md5=22c9f396db7ee5317c3890bdaae63357", "industry": { "class": "租赁和商务服务业", "class_name": "社会经济咨询", "subclass": "商务服务业" }, "is_deposit_project": false, "label_dic": { "consortium_permit": "否", "is_target_small": 1, "mode_of_partipation": 1, "need_ca": 1, "need_performance": 1, "need_qualification": 1 }, "match_enterprise": [ { "from": "洛阳市", "to": "洛阳市公共资源交易中心", "type": "update" }, { "from": "", "to": "洛阳市公共资源交易中心", "type": "add" }, { "from": "洛阳市", "to": "洛阳市生态环境局", "type": "update" } ], "match_enterprise_type": 3, "moneys": [], "moneys_attachment": [ 2200000.0, 101462884.4, 2177000.0, 65831529.13, 2190000.0, 2158000.0, 2175000.0 ], "moneysource": "财政资金", "name": "洛阳市生态环境局新安分局新安县大气污染防治精细化管理咨询服务项目", "nlp_enterprise": [ "洛阳市生态环境局新安分局", "中经国际招标集团有限公司" ], "nlp_enterprise_attachment": [ "洛阳市生态环境局新安分局", "中经国际招标集团有限公司", "洛阳市公共资源交易中心", "河南省政府", "中华人民共和国", "洛阳市政府", "洛阳政府", "洛阳市财政局", "财政部", "戒毒管理局", "民政部", "中国残疾人联合会", "洛阳市生态环境局", "天津布尔科技有限公司", "国家统计局国家发展和改革委员财政部", "工信部", "中国政府", "河南政府", "河南铭服云信息科技有限公司", "郑州市数字政通信息技术有限公司", "河南拓普计算机网络工程有限公司" ], "pb": { "bid_content_type": "工程咨询", "industry": "办公楼", "location": "新安县", "projectDigest": "项目信息;或者利用技术手段对享有相同权限的市场主体提供有差别的信息。审查结果:口有无,序号:18,条款内容:招标公告或者资格预审公告未在指定媒介发布。审查结果:口有无,序号:19,条款内容:故意对递交或者解密投标文件设置障碍", "project_name_refind": "新安县大气污染防治精细化管理咨询服务", "project_property": "新建" }, "pb_project_name": "新安县大气污染防治精细化管理咨询服务", "person_review": [ "雷体艳", "高兴辉", "郭随民", "杜中群", "钱晓苏" ], "pinmu_name": "", "policies": [ "《公平竞争审查制度实施细则》", "《中华人民共和国政府采购法》", "《关于政府采购支持监狱企业发展有关问题的通知》", "《政府采购促进中小企业发展管理办法》", "《中华人民共和国招标投标法》", "《公平竞争审查制度实施细则》", "《中华人民共和国政府采购法》", "《关于政府采购支持监狱企业发展有关问题的通知》", "《政府采购促进中小企业发展管理办法》", "《中华人民共和国招标投标法》" ], "prem": { "Project": { "code": "", "name": "洛阳市生态环境局新安分局新安县大气污染防治精细化管理咨询服务项目", "roleList": [ { "address": "新安县城北万象大道与省道交叉口", "linklist": [ [ "郭先生", "18838818760" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "tenderee", "role_prob": 0.8, "role_text": "洛阳市生态环境局新安分局", "serviceTime": "" }, { "address": "洛阳市洛龙区开元大道258号世贸中心B座2405室", "linklist": [ [ "韩女士", "0379-62220716" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "中经国际招标集团有限公司", "serviceTime": "" }, { "address": "", "linklist": [], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": "2158000.00", "money_unit": "元" }, "role_name": "win_tenderer", "role_prob": 0.8, "role_text": "天津布尔科技有限公司", "serviceTime": "" }, { "address": "", "linklist": [], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": "2177000.00", "money_unit": "元" }, "role_name": "second_tenderer", "role_text": "河南铭服云信息科技有限公司", "serviceTime": "" }, { "address": "", "linklist": [], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": "2175000.00", "money_unit": "元" }, "role_name": "third_tenderer", "role_text": "河南拓普计算机网络工程有限公司", "serviceTime": "" } ], "tendereeMoney": "2200000.00", "tendereeMoneyUnit": "元", "uuid": "55b4ba83-8f70-4b9f-9641-230eccf8c77b" } }, "process_time": "2025-02-28 10:56:05", "product": [ "大气污染防治精细化管理咨询服务项目", "辅助资料表", "大气污染防治精细化管理咨询服务", "颗粒物雷达走航监测服务", "VOCS走航监测服务", "污染排查监测服务" ], "product_attrs": { "data": [], "header": [], "header_col": [] }, "project_contacts": [ [ "韩女士", "0379-62220716" ] ], "project_label": { "标题": { "企业培训": [ [ "管理咨询", 1 ] ], "环境检测及治理": [ [ "污染防治", 1 ] ] }, "核心字段": { "企业培训": [ [ "管理咨询", 4 ] ], "环境检测及治理": [ [ "污染防治", 4 ] ], "空净通": [ [ "污染防治", 1 ] ] } }, "property_label": "", "proportion": "", "requirement": "一、项目基本情况,1、政府采购管理部门备案编号:洛采公开-2025-11,项目编号:洛直政采招标(2025)0014号,2、项目名称:洛阳市生态环境局新安分局新安县大气污染防治精细化管理咨询服务项目,3、采购方式:公开招标,4、预算金额:2200000.00元,最高限价:2200000.00元,包号:洛直政采招标(2025)0014号-1,包名称:洛阳市生态环境局新安分局新安县大气污染防治精细化管理咨询服务项目,序号:1,包预算(元):2200000,包最高限价(元):2200000,是否专门面向中小企业:是,采购预留金额(元):2200000。5、采购需求(包括但不限于标的的名称、数量、简要技术需求或服务要求等:5.1采购范围:新安县大气污染防治精细化管理咨询服务,服务内容:颗粒物雷达走航监测服务、,VOCS走航监测服务、污染排查监测服务、空气质量综合研判平台售后服务及功能完善、大气污染防,治咨询服务等。5.3资金来源:财政资金,5.4服务要求:符合行业相关规定,5.4服务期限:1年,5.4服务地点:采购人指定地点,6、合同履行期限:1年,7、本项目是否接受联合体投标:否。8、是否接受进口产品:否。9.是否专门面向中小企业:是,一、采购项目概况,1、项目名称:洛阳市生态环境局新安分局新安县大气污染防治精细化管理咨询服务项目,2、采购人:洛阳市生态环境局新安分局,3、采购代理机构:中经国际招标集团有限公司,4、项目编号:洛直政采招标(2025)0014号,5、采购方式:采购公开,6、采购信息发布媒体:中国政府采购网、河南政府采购网、洛阳市公共资源交易中心网等,7、招标文件获取份数:10家(详见招标采购文件获取名单),8、开标日期:2025-02-27,9、开标地点:洛阳市公共资源交易中心开标四室,10、评标日期:2025-02-27,11、评标地点:洛阳市公共资源交易中心评标四室,12、其他事项说明:无,一、项目基本情况,1、政府采购管理部门备案编号:洛采公开-2025-11,项目编号:洛直政采招标(2025)0014号,2、项目名称:洛阳市生态环境局新安分局新安县大气污染防治精细化管理咨询服务项目,3、采购方式:公开招标,4、预算金额:2200000.00元,最高限价:2200000.00元,包号:洛直政采招标(2025)0014号-1,包名称:洛阳市生态环境局新安分局新安县大气污染防治精细化管理咨询服务项目,序号:1,包预算(元):2200000,包最高限价(元):2200000,是否专门面向中小企业:是,采购预留金额(元):2200000。5、采购需求(包括但不限于标的的名称、数量、简要技术需求或服务要求等:5.1采购范围:新安县大气污染防治精细化管理咨询服务,服务内容:颗粒物雷达走航监测服务、,VOCS走航监测服务、污染排查监测服务、空气质量综合研判平台售后服务及功能完善、大气污染防,治咨询服务等。5.3资金来源:财政资金,5.4服务要求:符合行业相关规定,5.4服务期限:1年,5.4服务地点:采购人指定地点,6、合同履行期限:1年,7、本项目是否接受联合体投标:否。8、是否接受进口产品:否。9.是否专门面向中小企业:是,一、采购项目概况,1、项目名称:洛阳市生态环境局新安分局新安县大气污染防治精细化管理咨询服务项目,2、采购人:洛阳市生态环境局新安分局,3、采购代理机构:中经国际招标集团有限公司,4、项目编号:洛直政采招标(2025)0014号,5、采购方式:采购公开,6、采购信息发布媒体:中国政府采购网、河南政府采购网、洛阳市公共资源", "serviceTime": { "service_days": 365, "service_end": "", "service_start": "" }, "success": true, "time_bidclose": "2025-02-27 09:05:00", "time_bidopen": "2025-02-27 09:05:00", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_contractEnd": "", "time_contractStart": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "2025-02-12", "time_getFileStart": "2025-02-07 12:00:00", "time_listingEnd": "", "time_listingStart": "", "time_planned": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2025-02-28 10:44:00", "time_signContract": "", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", "version_date": "2025-02-19", "word_count": { "正文": 189, "附件": 23633 } }
  679. # ''')
  680. # b = '''
  681. # [{"fileTitle":"招标文件正文.pdf","fileMd5":"75414b85b0b16df0f2a9aa5ccbf5f010","classification":"招标文件"},{"fileTitle":"中小微企业声明函(投标人).pdf","fileMd5":"794cad9758929c8a8591d59cc0aa8a45","classification":"其他"},{"fileTitle":"洛阳_评标报告.pdf","fileMd5":"079b5ee9bfc6d312f22325a9719936d0","classification":"其他"},{"fileTitle":"信用承诺书.pdf","fileMd5":"005451d6326c66761c3704594368bdd2","classification":"其他"},{"fileTitle":"招标文件正文.pdf","fileMd5":"75414b85b0b16df0f2a9aa5ccbf5f010","classification":"招标文件"},{"fileTitle":"中小微企业声明函(投标人).pdf","fileMd5":"794cad9758929c8a8591d59cc0aa8a45","classification":"其他"},{"fileTitle":"洛阳_评标报告.pdf","fileMd5":"079b5ee9bfc6d312f22325a9719936d0","classification":"其他"},{"fileTitle":"信用承诺书.pdf","fileMd5":"005451d6326c66761c3704594368bdd2","classification":"其他"}]
  682. # '''
  683. # c = "汝阳县政府采购网"
  684. # print(extractCount(a,b,c))