'''
Created on 2019-01-04
@author: User
'''
import os
import sys
import copy
import re
import codecs
import json
import time
import traceback
import requests
import numpy as np
from bs4 import BeautifulSoup, Comment
from unicodedata import normalize

_time1 = time.time()
sys.path.append(os.path.abspath("../.."))
from BiddingKG.dl.common.Utils import *
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.complaint.punish_predictor as punish_rule
from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
from BiddingKG.dl.ratio.re_ratio import extract_ratio
from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list, extract_addr
from BiddingKG.dl.interface.get_label_dic import get_all_label


# Custom JSON encoder: converts numpy and bytes values that the default encoder rejects
class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)
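
# A minimal usage sketch of MyEncoder (the sample values are hypothetical;
# np is the numpy import above). It shows why the encoder exists: the default
# json.JSONEncoder raises TypeError on numpy arrays, numpy floats and bytes,
# while MyEncoder converts them to plain lists/floats/str before serialization.
def _demo_myencoder():
    sample = {"scores": np.array([0.91, 0.72]), "prob": np.float32(0.88), "raw": b"abc"}
    return json.dumps(sample, cls=MyEncoder, ensure_ascii=False)
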
def get_login_web_set():
    file = os.path.join(os.path.dirname(__file__), "login_weblist.txt")
    list_web = []
    try:
        if os.path.exists(file):
            with open(file, "r", encoding="utf8") as f:
                while True:
                    line = f.readline()
                    if not line:
                        break
                    line = line.strip()
                    if line:
                        list_web.append(line)
    except Exception as e:
        traceback.print_exc()
    _set = set(list_web)
    log("get_login_web_set length %d" % (len(_set)))
    return _set

set_login_web = get_login_web_set()
def extractCount(extract_dict, page_attachments, web_source_name):
    # time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
    if len(extract_dict):
        _extract = extract_dict
    else:
        _extract = {}
    dict_pack = _extract.get("prem", {})
    extract_count = 0
    list_code = _extract.get("code", [])
    word_count = _extract.get("word_count", {})
    if word_count.get("正文", 0) > 500:
        extract_count += 3
    if len(list_code) > 0:
        project_code = list_code[0]
    else:
        project_code = ""
    project_name = _extract.get("name", "")
    bidding_budget = ""
    win_tenderer = ""
    win_bid_price = ""
    linklist_count = 0
    for _key in dict_pack.keys():
        if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"] != '' and float(dict_pack[_key]["tendereeMoney"]) > 0:
            extract_count += 1
            if bidding_budget == "":
                bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
        for _role in dict_pack[_key]["roleList"]:
            if isinstance(_role, list):
                extract_count += 1
                if _role[2] != '' and float(_role[2]) > 0:
                    extract_count += 1
                if _role[0] == "tenderee":
                    tenderee = _role[1]
                if _role[0] == "win_tenderer":
                    if _role[1] is not None and _role[1] != "":
                        extract_count += 2
                        if win_tenderer == "":
                            win_tenderer = _role[1]
                    if _role[2] != '' and float(_role[2]) > 0:
                        extract_count += 2
                        if win_bid_price == "":
                            win_bid_price = str(float(_role[2]))
                if _role[0] == "agency":
                    agency = _role[1]
            if isinstance(_role, dict):
                extract_count += 1
                if "role_money" in _role:
                    if str(_role["role_money"].get("money", "")) != '' and float(_role["role_money"].get("money", "")) > 0:
                        extract_count += 1
                if _role.get("role_name") == "tenderee":
                    tenderee = _role["role_text"]
                if _role.get("role_name") == "win_tenderer":
                    if _role["role_text"] is not None and _role["role_text"] != "":
                        extract_count += 2
                        if win_tenderer == "":
                            win_tenderer = _role["role_text"]
                    if "role_money" in _role:
                        if str(_role["role_money"]["money"]) != '' and float(_role["role_money"]["money"]) > 0:
                            extract_count += 2
                            if win_bid_price == "":
                                win_bid_price = str(float(_role["role_money"]["money"]))
                if _role["role_name"] == "agency":
                    agency = _role["role_text"]
                linklist = _role.get("linklist", [])
                for link in linklist:
                    for l in link:
                        if l != "":
                            linklist_count += 1
    extract_count += linklist_count // 2
    if project_code != "":
        extract_count += 1
    if project_name != "":
        extract_count += 1
    if page_attachments is not None and page_attachments != '':
        try:
            _attachments = json.loads(page_attachments)
            has_zhaobiao = False
            has_qingdan = False
            if len(_attachments) > 0:
                for _atta in _attachments:
                    classification = _atta.get("classification", "")
                    if str(classification) == '招标文件':
                        has_zhaobiao = True
                    if str(classification) == '采购清单':
                        has_qingdan = True
                extract_count += 2
                if has_zhaobiao:
                    extract_count += 2
                if has_qingdan:
                    extract_count += 1
        except Exception as e:
            traceback.print_exc()
    list_approval_dict = _extract.get("approval", [])
    for _dict in list_approval_dict:
        for k, v in _dict.items():
            if v is not None and v != '' and v != "未知":
                extract_count += 1
    punish_dict = _extract.get("punish", {})
    for k, v in punish_dict.items():
        if v is not None and v != '' and v != "未知":
            extract_count += 1
    if web_source_name in set_login_web:
        extract_count -= 3
    return extract_count
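
# A hedged usage sketch of extractCount with a hand-built minimal extract dict
# (all field values are hypothetical); real callers pass the full data_res
# assembled in predict() below. Assuming web_source_name is not in the
# login-wall list, these fields should count 3 (body > 500 chars) + 1 (budget)
# + 1 (code) + 1 (name) = 6.
def _demo_extract_count():
    extract_dict = {
        "code": ["ZB-2024-001"],
        "name": "某设备采购项目",
        "prem": {"Project": {"tendereeMoney": "1000000", "roleList": []}},
        "word_count": {"正文": 800, "附件": 0},
    }
    return extractCount(extract_dict, page_attachments='[]', web_source_name='')
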
# Normalize character encodings (e.g. fold full-width forms to half-width)
def str_normalize(text):
    # time1 = time.time()
    cn_punctuation = "¥,。:;{}!?()<"
    text_split = re.split("([{}])+".format(cn_punctuation), text)
    new_text = ""
    for s in text_split:
        if re.search("^[{}]+$".format(cn_punctuation), s):
            new_text += s
        else:
            new_text += normalize('NFKD', s)
    # print("str_normalize cost time %s" % str(time.time() - time1))
    return new_text
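
# An illustrative sketch (hypothetical input): NFKD normalization folds
# full-width digits and letters to their half-width ASCII forms, while the
# characters listed in cn_punctuation are split out and passed through
# untouched, so Chinese punctuation survives normalization.
def _demo_str_normalize():
    return str_normalize("预算１００万元")  # full-width "１００" becomes "100"
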
# Repair prem entities whose region prefix is incomplete
def repair_entity(prem, district_dict, list_articles):
    district_dict = district_dict['district']
    province = district_dict['province'] if district_dict['province'] and district_dict['province'] not in ['未知', '全国'] else ""
    city = district_dict['city'] if district_dict['city'] and district_dict['city'] != '未知' else ""
    district = district_dict['district'] if district_dict['district'] and district_dict['district'] != '未知' else ""
    content_text = list_articles[0].content
    autonomous_region_dict = {
        "新疆": "新疆维吾尔",
        "西藏": "西藏",
        "内蒙古": "内蒙古",
        "广西": "广西壮族",
        "宁夏": "宁夏回族"
    }
    for package, _prem in prem[0]['prem'].items():
        for role in _prem['roleList']:
            if role['role_name'] in ['tenderee', 'agency']:
                role_text = role['role_text']
                if re.search("^[省市县区]", role_text):
                    if role_text[0] == '省' and role_text[:2] not in ['省道']:
                        role['role_text'] = province + role_text
                    elif role_text[0] == '市' and role_text[:2] not in ['市政', '市场']:
                        if district + '市' in content_text:
                            # county-level city
                            role['role_text'] = district + role_text
                        else:
                            role['role_text'] = city + role_text
                    elif role_text[0] in ['县', '区']:
                        role['role_text'] = district + role_text
                elif re.search("^自治[区州县]", role_text):
                    if role_text[:3] == '自治区':
                        role['role_text'] = autonomous_region_dict.get(province, "") + role_text
                    elif role_text[:3] in ['自治县', "自治州"]:
                        if re.search("自治[县州]?$", district):
                            role['role_text'] = re.sub("自治[县州]?", "", district) + role_text
                        elif re.search("族$", district):
                            role['role_text'] = district + role_text
                        elif re.search("自治[县州]?$", city):
                            role['role_text'] = re.sub("自治[县州]?", "", city) + role_text
                        elif re.search("族$", city):
                            role['role_text'] = city + role_text
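
# An illustrative sketch of repair_entity (data shapes inferred from the
# function above; names and values are hypothetical). A tenderee whose text
# starts with a bare "市" has the city from the district result prepended.
def _demo_repair_entity():
    class _Article:
        content = "广州市第一人民医院设备采购公告"
    prem = [{"prem": {"Project": {"roleList": [
        {"role_name": "tenderee", "role_text": "市第一人民医院"}]}}}]
    district = {"district": {"province": "广东", "city": "广州", "district": "越秀"}}
    repair_entity(prem, district, [_Article()])
    # role_text is now "广州市第一人民医院" ("越秀市" does not occur in the
    # article text, so the county-level-city branch is not taken)
    return prem[0]["prem"]["Project"]["roleList"][0]["role_text"]
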
def predict(doc_id, text, title="", page_time="", web_source_no='', web_source_name="", original_docchannel='', page_attachments='[]', **kwargs):
    cost_time = dict()
    if web_source_no is None:
        web_source_no = ''
    if web_source_name is None:
        web_source_name = ''
    start_time = time.time()
    log("start process doc %s" % (str(doc_id)))

    # normalize character encodings
    text = str_normalize(text)
    list_articles, list_sentences, list_entitys, list_outlines, _cost_time = Preprocessing.get_preprocessed([[doc_id, text, "", "", title, page_time, web_source_no]], useselffool=True)
    log("get preprocessed done of doc_id%s" % (doc_id))
    cost_time["preprocess"] = round(time.time() - start_time, 2)
    cost_time.update(_cost_time)

    '''Outline extraction and extraction of outline-related content'''
    sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
    parse_document = ParseDocument(text, True, list_obj=sentence2_list)
    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope = extract_parameters(parse_document)
    if sentence2_list_attach != [] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text == "":
        parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope = extract_parameters(parse_document)
    if addr_bidopen_text == '':
        addr_bidopen_text = extract_addr(list_articles[0].content)

    # Filter out erroneous entities whose value is 0 in Redis
    # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])

    # Depends on sentence order
    # start_time = time.time()  # announcement type / lifecycle extraction; superseded here, moved to a later step 2022/4/29
    # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
    #                                                         web_source_no=web_source_no, original_docchannel=original_docchannel)
    # cost_time["channel"] = round(time.time() - start_time, 2)

    start_time = time.time()  # project code / name extraction
    codeName = predictor.getPredictor("codeName").predict(list_sentences, MAX_AREA=5000, list_entitys=list_entitys)
    log("get codename done of doc_id%s" % (doc_id))
    cost_time["codename"] = round(time.time() - start_time, 2)

    start_time = time.time()  # announcement category prediction
    channel_dic, msc = predictor.getPredictor("channel").predict_merge(title, list_sentences[0], text, original_docchannel, web_source_no)
    cost_time["rule_channel"] = round(time.time() - start_time, 2)

    start_time = time.time()  # role / money model extraction
    predictor.getPredictor("prem").predict(list_sentences, list_entitys)
    log("get prem done of doc_id%s" % (doc_id))
    cost_time["prem"] = round(time.time() - start_time, 2)

    # start_time = time.time()  # product name and bid-failure reason extraction; superseded here, moved to a later step 2022/4/29
    # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
    # fail_reason = predictor.getPredictor("product").predict(list_sentences, list_entitys, list_articles, fail)  # only returns the failure reason; products are added to the Entity class
    # # predictor.getPredictor("product").predict(list_sentences, list_entitys)
    # log("get product done of doc_id%s" % (doc_id))
    # cost_time["product"] = round(time.time() - start_time, 2)

    start_time = time.time()  # regex extraction of product attributes: unit price, quantity, brand/spec; project, demand, budget, time
    product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
    log("get product attributes done of doc_id%s" % (doc_id))
    cost_time["product_attrs"] = round(time.time() - start_time, 2)

    # whether this is a deposit-type project
    deposit_project = is_deposit_project(title, codeName[0]['name'], requirement_text)

    start_time = time.time()  # rule-based role extraction
    predictor.getPredictor("roleRule").predict(list_articles, list_sentences, list_entitys, codeName, channel_dic, all_winner=is_all_winner(title), req_scope=requirement_scope, deposit_project=deposit_project)
    cost_time["rule"] = round(time.time() - start_time, 2)

    '''Rules: treat a date-format entity in the final sentence as tenderee or agency 2021/12/30; final role supplement, and drop tenderees containing 公共资源交易中心'''
    start_time = time.time()  # rule-based role extraction
    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_sentences, list_entitys, codeName)
    cost_time["roleRuleFinal"] = round(time.time() - start_time, 2)

    start_time = time.time()  # rule-based tenderee recall
    predictor.getPredictor("tendereeRuleRecall").predict(list_articles, list_sentences, list_entitys, codeName)
    cost_time["tendereeRuleRecall"] = round(time.time() - start_time, 2)

    '''Rule-based adjustment of role probabilities'''
    start_time = time.time()
    predictor.getPredictor("rolegrade").predict(list_sentences, list_entitys, original_docchannel)
    cost_time["rolegrade"] = round(time.time() - start_time, 2)

    '''Rule-based adjustment of money probabilities'''
    start_time = time.time()
    predictor.getPredictor("moneygrade").predict(list_sentences, list_entitys)
    cost_time["moneygrade"] = round(time.time() - start_time, 2)

    start_time = time.time()  # contact-person model extraction
    predictor.getPredictor("epc").predict(list_sentences, list_entitys)
    log("get epc done of doc_id%s" % (doc_id))
    cost_time["person"] = round(time.time() - start_time, 2)

    start_time = time.time()  # time-category extraction
    predictor.getPredictor("time").predict(list_sentences, list_entitys)
    log("get time done of doc_id%s" % (doc_id))
    cost_time["time"] = round(time.time() - start_time, 2)

    start_time = time.time()  # deposit payment method
    payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content)
    cost_time["deposit"] = round(time.time() - start_time, 2)

    # Must run after getPredictor("prem").predict and before getAttributes.getPREMs:
    # rule that reclassifies fees in supervision/construction/design/survey announcements as tender or winning amounts
    predictor.getPredictor("prem").correct_money_by_rule(title, list_entitys, list_articles)

    # Added 2021-12-29: extract total price and unit price
    start_time = time.time()  # total / unit price extraction
    predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys)
    cost_time["total_unit_money"] = round(time.time() - start_time, 2)

    # Depends on sentence order
    start_time = time.time()  # entity linking
    entityLink.link_entitys(list_entitys)
    doctitle_refine = entityLink.doctitle_refine(title)
    nlp_enterprise, nlp_enterprise_attachment, dict_enterprise = entityLink.get_nlp_enterprise(list_entitys[0])
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles, list_outlines, page_time)
    log("get attributes done of doc_id%s" % (doc_id))
    cost_time["attrs"] = round(time.time() - start_time, 2)

    if original_docchannel != 302:  # approval projects skip the extraction below
        '''Table element extraction'''
        table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise + nlp_enterprise_attachment, web_source_name, is_all_winner(title))
        # print('table-extracted winners:', table_prem)
        # print('originally extracted roles:', prem[0]['prem'])
        if table_prem:
            getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem, in_attachment=in_attachment)

        '''Candidate extraction'''
        candidate_top3_prem, candidate_dic, in_attachment = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise + nlp_enterprise_attachment)
        # print('table-extracted candidates:', candidate_top3_prem)
        getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem, in_attachment=in_attachment)

    '''Consortium (joint-bid) information'''
    getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)

    '''Correct winning prices when a table-style procurement announcement lists several products; if the winning amount is below the products' total, replace it with the total'''
    getAttributes.correct_rolemoney(prem, total_product_money, list_articles)

    '''Correct channel predictions: tender announcements that nevertheless have a winner, and win announcements lacking winning keywords'''  # depends on prem
    start_time = time.time()
    # content = list_articles[0].content
    # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
    if original_docchannel == 302:
        channel_dic = {"docchannel":
                           {"docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目"}
                       }
        prem[0]['prem'] = {}  # approval projects drop this
    else:
        channel_dic, msc = predictor.getPredictor("channel").final_change(channel_dic, prem[0], original_docchannel, msc)
        # print('msc', msc)
    cost_time["rule_channel2"] = round(time.time() - start_time, 2)

    '''Extract multiple winners per package and all money amounts'''
    all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys, list_sentences, is_all_winner(title))

    start_time = time.time()  # product name and bid-failure reason extraction; depends on the docchannel result
    fail = channel_dic['docchannel']['docchannel'] == "废标公告"
    fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences, list_entitys, list_articles, fail, out_lines=out_lines)  # only returns the failure reason; products are added to the Entity class. 2022/7/29: also returns products for industry classification
    # predictor.getPredictor("product").predict(list_sentences, list_entitys)
    log("get product done of doc_id%s" % (doc_id))
    cost_time["product"] = round(time.time() - start_time, 2)

    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0], page_time, prem, channel_dic))

    '''In single-source announcements, downgrade the winner role to pre-winner'''
    getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)

    '''Procurement-intention prediction when the announcement has no table'''  # depends on docchannel result, products and prem
    '''Add product-attribute results found in the project name to the demand, budget-time and purchase-time elements'''
    predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences, list_entitys, list_outlines, product_list, codeName, prem, text, page_time)

    '''Industry classification, using the title, project name, products and the roles in prem'''
    industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem)

    '''District extraction'''
    start_time = time.time()
    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem, title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
    cost_time["district"] = round(time.time() - start_time, 2)

    '''Repair entities using the district result'''
    repair_entity(prem, district, list_articles)

    '''Final tenderee recall based on the data source'''
    prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(web_source_no, web_source_name, prem)

    '''Generate project labels from the keyword table'''
    project_label = predictor.getPredictor('project_label').predict(title, product=','.join(product_list), project_name=codeName[0]['name'], prem=prem)
    # additionally requested labels
    project_label = predictor.getPredictor('project_label').predict_other(project_label, industry, title, codeName[0]['name'], ','.join(product_list), list_articles)
    # print(project_label)

    '''Second-level property-rights classification labels'''
    property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list), project_name=codeName[0]['name'], prem=prem, channel_dic=channel_dic)

    '''Final validation of prem'''
    getAttributes.confirm_prem(prem[0]['prem'], channel_dic)

    # extract the fields needed for proposed-construction (拟在建) projects
    start_time = time.time()
    pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text)
    log("pb_extract done of doc_id%s" % (doc_id))
    cost_time["pb_extract"] = round(time.time() - start_time, 2)

    '''Tagging'''
    label_dic = get_all_label(title, list_articles[0].content)

    # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic)[0]
    # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic), list_channel_dic)[0]
    version_date = {'version_date': '2024-11-07'}
    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)

    if original_docchannel == 302:
        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text)
        data_res['approval'] = approval

    if channel_dic['docchannel']['doctype'] == '处罚公告':  # 20240627: extract dishonesty elements from punishment announcements
        start_time = time.time()  # dishonesty element extraction
        punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles, list_sentences, list_entitys)
        cost_time["punish"] = round(time.time() - start_time, 2)
        data_res['punish'] = punish_dic
        if "Project" in data_res['prem']:
            for d in data_res['prem']['Project']['roleList']:
                if d['role_name'] == 'tenderee' and d.get('role_prob', 0.6) < 0.6:  # punishment announcements: drop low-probability tenderees
                    data_res['prem']['Project']['roleList'] = [d for d in data_res['prem']['Project']['roleList'] if d['role_name'] != 'tenderee']
                    break
            if len(data_res['prem']['Project']['roleList']) == 0 and data_res['prem']['Project'].get('tendereeMoney', 0) in [0, '0']:  # remove empty packages
                data_res['prem'].pop('Project')

    # add products from the product attributes into the product list
    for d in data_res['product_attrs']['data']:
        if isinstance(d['product'], str) and d['product'] not in data_res['product']:
            data_res['product'].append(d['product'])

    '''Final check and correction of tender / winning amounts'''
    getAttributes.limit_maximum_amount(data_res, list_entitys[0])

    data_res["project_label"] = project_label
    data_res["property_label"] = property_label
    data_res["doctitle_refine"] = doctitle_refine
    data_res["nlp_enterprise"] = nlp_enterprise
    data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
    data_res["dict_enterprise"] = dict_enterprise
    # number of extracted elements
    data_res['extract_count'] = extractCount(data_res, page_attachments, web_source_name)
    # whether the text contains a table
    data_res['exist_table'] = 1 if re.search("<td", text) else 0
    data_res["cost_time"] = cost_time
    data_res["success"] = True
    # indexed fields needed for proposed-construction projects
    data_res["proportion"] = pb_json.get('pb').get('proportion', '')
    data_res["pb_project_name"] = pb_json.get('pb').get('project_name_refind', '')
    # qualification requirements
    data_res['aptitude'] = aptitude_text[:1500]
    # procurement content
    data_res['requirement'] = requirement_text[:1500]
    # labels
    data_res['label_dic'] = label_dic
    # bid-opening address
    data_res['addr_dic'] = {'addr_bidopen': addr_bidopen_text}
    # bid-submission address
    data_res['addr_dic']['addr_bidsend'] = addr_bidsend_text
    # character counts for body text and attachments
    text_main, text_attn = 0, 0
    for sentence in list_sentences[0]:
        if sentence.in_attachment:
            text_attn += len(sentence.sentence_text)
        else:
            text_main += len(sentence.sentence_text)
    data_res['word_count'] = {'正文': text_main, '附件': text_attn}
    # limit the number of products
    data_res['product'] = data_res['product'][:500]
    data_res['product_attrs']['data'] = data_res['product_attrs']['data'][:500]
    # whether this is a deposit project
    data_res['is_deposit_project'] = deposit_project

    # for _article in list_articles:
    #     log(_article.content)
    #
    # for list_entity in list_entitys:
    #     for _entity in list_entity:
    #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s" %
    #             (str(_entity.entity_type), str(_entity.entity_text), str(_entity.label), str(_entity.values),
    #              str(_entity.sentence_index), str(_entity.begin_index), str(_entity.end_index)))

    _extract_json = json.dumps(data_res, cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
    _extract_json = _extract_json.replace("\x06", "").replace("\x05", "").replace("\x07", "")
    return _extract_json  # , list_articles[0].content, get_ent_context(list_sentences, list_entitys)
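
# A hedged end-to-end sketch of calling predict() (the document text and
# metadata below are hypothetical). The function returns the full extraction
# result serialized as a JSON string, so callers typically json.loads it.
def _demo_predict():
    html = "<div>某医院设备采购项目招标公告,预算:100万元。</div>"
    extract_json = predict("demo_doc_id", html, title="设备采购招标公告", page_time="2024-11-07")
    return json.loads(extract_json)  # dict with keys such as prem, docchannel, product, ...
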
def test1(name, content):
    user = {
        "content": content,
        "id": name
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post("http://192.168.2.102:15030" + '/article_extract', json=user, headers=myheaders, verify=True)
    resp_json = _resp.content.decode("utf-8")
    # print(resp_json)
    return resp_json
def get_ent_context(list_sentences, list_entitys):
    rs_list = []
    sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
    for list_entity in list_entitys:
        for _entity in list_entity:
            if _entity.entity_type in ['org', 'company', 'money']:
                s = sentences[_entity.sentence_index].sentence_text
                b = _entity.wordOffset_begin
                e = _entity.wordOffset_end
                # print("%s %d %.4f; %s %s %s" % (_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b - 10):b], _entity.entity_text, s[e:e + 10]))
                rs_list.append("%s %d %.4f; %s ## %s ## %s" % (_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b - 10):b], _entity.entity_text, s[e:e + 10]))
    return '\n'.join(rs_list)
def get_role_context(docid, list_sentences, list_entitys):
    rs_list = []
    sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
    for list_entity in list_entitys:
        for _entity in list_entity:
            if _entity.entity_type in ['org', 'company']:
                idx = _entity.entity_id
                sentence = sentences[_entity.sentence_index]
                # _span = spanWindow(tokens=sentence.tokens, begin_index=_entity.begin_index, end_index=_entity.end_index,
                #                    size=20, center_include=False, word_flag=True, text=_entity.entity_text)
                _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=40, center_include=False)
                rs_list.append((docid, idx, _entity.entity_type, _entity.label, '%.4f' % _entity.values[_entity.label],
                                _span[0], _entity.entity_text, _span[1]))
    return rs_list
if __name__ == "__main__":
    import pandas as pd

    t1 = time.time()
    # text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
    title = '打印机'
    # df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
    # # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
    # for i in range(30, 50, 1):
    #     text = df.loc[i, 'dochtmlcon']
    #     rs = json.loads(predict('', text, ''))
    #     print(rs['demand_info'])
    #     print(rs['product'])
    #     print(rs['product_attrs'])
    #     print(rs)

    # df2 = pd.read_csv('E:/导出数据/存款入围框架采购等公告_输入要素.csv')
    # df = pd.read_csv('E:\导出数据/存款入围框架采购等公告_预测结果0830.csv')
    # df1 = pd.read_csv('E:\导出数据/存款入围框架采购等公告_html.csv')
    # df = df.merge(df1, on='docid', how='left')
    # print(len(df), df.columns)
    # df['rs'] = df['extract_json1'].apply(lambda x: json.loads(x))
    # docids = []
    # n1 = n2 = 0
    # for docid, d, html in zip(df['docid'], df['rs'], df['dochtmlcon']):
    #     if d['docchannel']['docchannel'] == '招标公告' and '元' in html and 'Project' in d['prem'] and float(
    #             d['prem']['Project']['tendereeMoney']) == 0:
    #         docids.append(docid)
    #         n1 += 1
    #     else:
    #         n2 += 1
    # print(n1, n2)
    # df = df[df['docid'].isin(docids)]
    #
    # df = df.merge(df2, on='docid', how='left')
    # df.fillna('', inplace=True)
    # df = df[['docid', 'doctitle', 'page_time', 'web_source_no', 'web_source_name', 'original_docchannel', 'dochtmlcon']]
    # print(df.columns)
    # # df = df[:10]
    # print(len(df))
    #
    # l = []
    # for docid, text, title, page_time, web_no, web_name, channel in zip(df['docid'], df['dochtmlcon'], df['doctitle'], df['page_time'],
    #                                                                     df['web_source_no'], df['web_source_name'], df['original_docchannel']):
    #     rs, content, roles = predict('', text, title, page_time, web_no, web_name, channel)
    #     l.append((docid, rs, content, roles))
    # df = pd.DataFrame(l, columns=['docid', 'rs', 'content', 'roles'])
    # df.to_csv('E:\导出数据/存款入围框架采购等公告_招标公告无招标金额预测结果.csv')

    df = pd.read_csv('E:\角色金额数据/银行类招标金额缺失公告_输入要素.csv')
    # df2 = pd.read_csv('E:/角色金额数据/银行缺招标金额公告_3_html.csv')
    df2 = pd.read_csv('E:/角色金额数据/银行缺招标金额公告_012_html.csv')
    print(len(df), len(df2))
    df = df[df['docid'].isin(df2['docid'])]
    print(len(df))
    df = df.merge(df2, how='left', on='docid')
    print(len(df))
    # df1 = pd.read_excel('E:\角色金额数据/银行缺招标金额公告_检查汇总2.xlsx')
    # # df1 = pd.read_excel('E:\角色金额数据/银行缺招标金额公告_检查汇总2_补充招标内容金额后1105.xlsx')
    # df1.fillna('', inplace=True)
    # df1 = df1[df1['tendereeMoney'] == 0]
    # df = df.merge(df1, on='docid', how='right')
    df = df[['docid', 'doctitle', 'page_time', 'web_source_no', 'web_source_name', 'original_docchannel', 'dochtmlcon']]
    print(df.columns)
    # df = df[:10]
    print(len(df))
    l = []
    for docid, text, title, page_time, web_no, web_name, channel in zip(df['docid'], df['dochtmlcon'], df['doctitle'], df['page_time'],
                                                                        df['web_source_no'], df['web_source_name'], df['original_docchannel']):
        # predict() currently returns only the JSON string (the tuple return with
        # content and role context is commented out above), so unpack one value.
        rs = predict('', text, title, page_time, web_no, web_name, channel)
        l.append((docid, rs))
    df = pd.DataFrame(l, columns=['docid', 'rs'])
    # df.to_csv('E:\角色金额数据/银行缺招标金额公告_3预测结果.csv')
    # df.to_csv('E:\角色金额数据/银行缺招标金额公告_012预测结果.csv')
    # df.to_csv('E:\角色金额数据/银行缺招标金额公告_检查汇总2_修复后预测结果.csv')
    df.to_csv('E:\角色金额数据/银行缺招标金额公告_检查汇总2_修复后预测结果1106.csv')

    # with open('2.html', 'r', encoding='utf-8') as f:
    #     text = f.read()
    # t1 = time.time()
    # print(predict('', text, title))
    # t2 = time.time()
    # print(predict('', text, title))
    # t3 = time.time()
    # print('第一次耗时:%.4f, 第二次耗时:%.4f' % (t2 - t1, t3 - t2))
    # print(predict('', text, title))

    # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]
    # df = pd.read_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115_2.xlsx')[:]
    # new_prem = []
    # for i in range(len(df)):
    #     i = 530
    #     doc_id = df.loc[i, 'docid']
    #     text = df.loc[i, 'html']
    #     # title = df.loc[i, 'doctitle']
    #     rs = predict(doc_id, text)
    #     rs = json.loads(rs)
    #     prem = json.dumps(rs['prem'], ensure_ascii=False)
    #     # print(rs)
    #     new_prem.append(prem)
    #     print(prem)
    #     break
    # df['new_prem'] = pd.Series(new_prem)
    # print('耗时:', time.time() - t1)
    # # df.to_excel('E:/大网站规则识别/大网站要素提取结果20211115.xlsx')
    # df.to_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115.xlsx')
    # pass