extract.py
'''
Created on 2019-01-04
@author: User
'''
import os
from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import codecs
import traceback
import requests
import time
from unicodedata import normalize

_time1 = time.time()
sys.path.append(os.path.abspath("../.."))

from BiddingKG.dl.common.Utils import *
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.complaint.punish_predictor as punish_rule
import json
from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
from BiddingKG.dl.ratio.re_ratio import extract_ratio
from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list
from BiddingKG.dl.interface.get_label_dic import get_all_label
# Custom JSON encoder so that numpy arrays/floats and bytes can be serialized.
class MyEncoder(json.JSONEncoder):

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, (np.float_, np.float16, np.float32,
                              np.float64)):  # note: np.float_ was removed in NumPy 2.0
            return float(obj)
        elif isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)
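# Minimal usage sketch for MyEncoder (illustrative only, not part of the pipeline):
#     json.dumps({"scores": np.array([0.1, 0.9]), "name": b"abc"}, cls=MyEncoder)
#     # -> '{"scores": [0.1, 0.9], "name": "abc"}'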
def get_login_web_set():
    file = os.path.join(os.path.dirname(__file__), "login_weblist.txt")
    list_web = []
    try:
        if os.path.exists(file):
            with open(file, "r", encoding="utf8") as f:
                while 1:
                    line = f.readline()
                    if not line:
                        break
                    line = line.strip()
                    if line:
                        list_web.append(line)
    except Exception as e:
        traceback.print_exc()
    _set = set(list_web)
    log("get_login_web_set length %d" % (len(_set)))
    return _set

set_login_web = get_login_web_set()
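# Note: login_weblist.txt holds one web-source name per line (blank lines are
# skipped). Sources listed there get their extract_count reduced by one in
# extractCount below, presumably because login-gated pages yield partial text.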
def extractCount(extract_dict, page_attachments, web_source_name):
    # time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
    if len(extract_dict):
        _extract = extract_dict
    else:
        _extract = {}
    # print(_extract)
    dict_pack = _extract.get("prem", {})
    extract_count = 0
    list_code = _extract.get("code", [])
    if len(list_code) > 0:
        project_code = list_code[0]
    else:
        project_code = ""
    project_name = _extract.get("name", "")
    bidding_budget = ""
    win_tenderer = ""
    win_bid_price = ""
    linklist_count = 0
    for _key in dict_pack.keys():
        if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"] != '' and float(dict_pack[_key]["tendereeMoney"]) > 0:
            extract_count += 1
            if bidding_budget == "":
                bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
        for _role in dict_pack[_key]["roleList"]:
            if isinstance(_role, list):
                extract_count += 1
                if _role[2] != '' and float(_role[2]) > 0:
                    extract_count += 1
                if _role[0] == "tenderee":
                    tenderee = _role[1]
                if _role[0] == "win_tenderer":
                    if win_tenderer == "":
                        win_tenderer = _role[1]
                    if _role[2] != '' and float(_role[2]) > 0:
                        extract_count += 1
                        if win_bid_price == "":
                            win_bid_price = str(float(_role[2]))
                if _role[0] == "agency":
                    agency = _role[1]
            if isinstance(_role, dict):
                extract_count += 1
                if "role_money" in _role:
                    if str(_role["role_money"].get("money", "")) != '' and float(_role["role_money"].get("money", "")) > 0:
                        extract_count += 1
                if _role.get("role_name") == "tenderee":
                    tenderee = _role["role_text"]
                if _role.get("role_name") == "win_tenderer":
                    if win_tenderer == "":
                        win_tenderer = _role["role_text"]
                    if "role_money" in _role:
                        if str(_role["role_money"]["money"]) != '' and float(_role["role_money"]["money"]) > 0:
                            extract_count += 1
                            if win_bid_price == "":
                                win_bid_price = str(float(_role["role_money"]["money"]))
                if _role["role_name"] == "agency":
                    agency = _role["role_text"]
                linklist = _role.get("linklist", [])
                for link in linklist:
                    for l in link:
                        if l != "":
                            linklist_count += 1
    extract_count += linklist_count // 2
    if project_code != "":
        extract_count += 1
    if project_name != "":
        extract_count += 1
    if page_attachments is not None and page_attachments != '':
        try:
            _attachments = json.loads(page_attachments)
            has_zhaobiao = False  # has a "招标文件" (bidding document) attachment
            has_qingdan = False   # has a "采购清单" (procurement list) attachment
            if len(_attachments) > 0:
                for _atta in _attachments:
                    classification = _atta.get("classification", "")
                    if str(classification) == '招标文件':
                        has_zhaobiao = True
                    if str(classification) == '采购清单':
                        has_qingdan = True
            if has_zhaobiao:
                extract_count += 3
            if has_qingdan:
                extract_count += 2
        except Exception as e:
            traceback.print_exc()
    list_approval_dict = _extract.get("approval", [])
    for _dict in list_approval_dict:
        for k, v in _dict.items():
            if v is not None and v != '' and v != "未知":  # "未知" = unknown
                extract_count += 1
    punish_dict = _extract.get("punish", {})
    for k, v in punish_dict.items():
        if v is not None and v != '' and v != "未知":
            extract_count += 1
    if web_source_name in set_login_web:
        extract_count -= 1
    return extract_count
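# Illustrative sketch of the input shape extractCount scores (field names
# inferred from the access patterns above; all values made up):
#     extractCount(
#         {"code": ["ZB-2024-001"], "name": "打印机采购",
#          "prem": {"Project": {"tendereeMoney": "10000",
#                               "roleList": [{"role_name": "win_tenderer",
#                                             "role_text": "某某公司",
#                                             "role_money": {"money": "9500"},
#                                             "linklist": []}]}}},
#         page_attachments='[]', web_source_name='')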
# Normalize character encodings (NFKD), while keeping the CJK punctuation
# listed in cn_punctuation untouched.
def str_normalize(text):
    # time1 = time.time()
    cn_punctuation = "¥,。:;{}!?()<"
    text_split = re.split("([{}])+".format(cn_punctuation), text)
    # print(text_split)
    new_text = ""
    for s in text_split:
        if re.search("^[{}]+$".format(cn_punctuation), s):
            new_text += s
        else:
            new_text += normalize('NFKD', s)
    # print("str_normalize cost time %s" % str(time.time() - time1))
    # print(new_text)
    return new_text
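# Behavior sketch (follows from unicodedata.normalize semantics; example made up):
# NFKD folds full-width letters/digits to ASCII, while the CJK punctuation in
# cn_punctuation is re-attached unchanged:
#     str_normalize("ＡＢＣ１２３,金额:１００")  ->  "ABC123,金额:100"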
# Repair role entities in prem whose region prefix is incomplete (e.g. a
# role_text starting with 省/市/县/区), using the district extraction result.
def repair_entity(prem, district_dict, list_articles):
    district_dict = district_dict['district']
    province = district_dict['province'] if district_dict['province'] and district_dict['province'] not in ['未知', '全国'] else ""
    city = district_dict['city'] if district_dict['city'] and district_dict['city'] != '未知' else ""
    district = district_dict['district'] if district_dict['district'] and district_dict['district'] != '未知' else ""
    content_text = list_articles[0].content
    # Autonomous regions: map the short province name to its full prefix.
    autonomous_region_dict = {
        "新疆": "新疆维吾尔",
        "西藏": "西藏",
        "内蒙古": "内蒙古",
        "广西": "广西壮族",
        "宁夏": "宁夏回族"
    }
    for package, _prem in prem[0]['prem'].items():
        for role in _prem['roleList']:
            if role['role_name'] in ['tenderee', 'agency']:
                role_text = role['role_text']
                if re.search("^[省市县区]", role_text):
                    if role_text[0] == '省' and role_text[:2] not in ['省道']:
                        role['role_text'] = province + role_text
                    elif role_text[0] == '市' and role_text[:2] not in ['市政', '市场']:
                        if district + '市' in content_text:
                            # county-level city
                            role['role_text'] = district + role_text
                        else:
                            role['role_text'] = city + role_text
                    elif role_text[0] in ['县', '区']:
                        role['role_text'] = district + role_text
                elif re.search("^自治[区州县]", role_text):
                    if role_text[:3] == '自治区':
                        role['role_text'] = autonomous_region_dict.get(province, "") + role_text
                    elif role_text[:3] in ['自治县', "自治州"]:
                        if re.search("自治[县州]?$", district):
                            role['role_text'] = re.sub("自治[县州]?", "", district) + role_text
                        elif re.search("族$", district):
                            role['role_text'] = district + role_text
                        elif re.search("自治[县州]?$", city):
                            role['role_text'] = re.sub("自治[县州]?", "", city) + role_text
                        elif re.search("族$", city):
                            role['role_text'] = city + role_text
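# Illustrative sketch (made-up values): given a district result
#     {"district": {"province": "广东", "city": "广州", "district": "天河"}},
# a truncated tenderee "市第一人民医院" becomes "广州市第一人民医院"
# (or "天河市第一人民医院" if the article text mentions the county-level
# city "天河市").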
def predict(doc_id, text, title="", page_time="", web_source_no='', web_source_name="", original_docchannel='', page_attachments='[]', **kwargs):
    cost_time = dict()
    start_time = time.time()
    log("start process doc %s" % (str(doc_id)))

    # Normalize character encodings.
    text = str_normalize(text)
    list_articles, list_sentences, list_entitys, list_outlines, _cost_time = Preprocessing.get_preprocessed([[doc_id, text, "", "", title, page_time, web_source_no]], useselffool=True)
    log("get preprocessed done of doc_id%s" % (doc_id))
    cost_time["preprocess"] = round(time.time() - start_time, 2)
    cost_time.update(_cost_time)

    '''Outline extraction and extraction of outline-related content'''
    sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
    parse_document = ParseDocument(text, True, list_obj=sentence2_list)
    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text = extract_parameters(parse_document, list_articles[0].content)
    if sentence2_list_attach != [] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text == "":
        parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text = extract_parameters(parse_document, list_articles[0].content)

    # Filter out bogus enterprises whose value is 0 in Redis
    # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])

    # # depends on sentence order
    # start_time = time.time()  # channel/lifecycle extraction; deprecated here, moved to a later step (2022/4/29)
    # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
    #                                                         web_source_no=web_source_no, original_docchannel=original_docchannel)
    # cost_time["channel"] = round(time.time()-start_time,2)

    start_time = time.time()  # project code and name extraction
    codeName = predictor.getPredictor("codeName").predict(list_sentences, MAX_AREA=5000, list_entitys=list_entitys)
    log("get codename done of doc_id%s" % (doc_id))
    cost_time["codename"] = round(time.time() - start_time, 2)

    start_time = time.time()  # role/money model extraction
    predictor.getPredictor("prem").predict(list_sentences, list_entitys)
    log("get prem done of doc_id%s" % (doc_id))
    cost_time["prem"] = round(time.time() - start_time, 2)

    # start_time = time.time()  # product name and bid-failure reason extraction; deprecated here, moved to a later step (2022/4/29)
    # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
    # fail_reason = predictor.getPredictor("product").predict(list_sentences, list_entitys, list_articles, fail)  # returns only the failure reason; products are added to the Entity class
    # # predictor.getPredictor("product").predict(list_sentences, list_entitys)
    # log("get product done of doc_id%s"%(doc_id))
    # cost_time["product"] = round(time.time()-start_time,2)

    start_time = time.time()  # regex extraction of product-related elements: unit price, quantity, brand/spec; project, demand, budget, time
    product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
    log("get product attributes done of doc_id%s" % (doc_id))
    cost_time["product_attrs"] = round(time.time() - start_time, 2)

    start_time = time.time()  # rule-based role extraction
    predictor.getPredictor("roleRule").predict(list_articles, list_sentences, list_entitys, codeName)
    cost_time["rule"] = round(time.time() - start_time, 2)

    '''Rules: complete a date-formatted entity in the last sentence as tenderee or agency (2021/12/30); final rule-based role completion, and drop tenderees containing 公共资源交易中心 (public resource trading center)'''
    start_time = time.time()  # rule-based role extraction
    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_sentences, list_entitys, codeName)
    cost_time["roleRuleFinal"] = round(time.time() - start_time, 2)

    start_time = time.time()  # rule-based tenderee recall
    predictor.getPredictor("tendereeRuleRecall").predict(list_articles, list_sentences, list_entitys, codeName)
    cost_time["tendereeRuleRecall"] = round(time.time() - start_time, 2)

    '''Rule-based adjustment of role probabilities'''
    start_time = time.time()
    predictor.getPredictor("rolegrade").predict(list_sentences, list_entitys, original_docchannel)
    cost_time["rolegrade"] = round(time.time() - start_time, 2)

    '''Rule-based adjustment of money probabilities'''
    start_time = time.time()
    predictor.getPredictor("moneygrade").predict(list_sentences, list_entitys)
    cost_time["moneygrade"] = round(time.time() - start_time, 2)

    start_time = time.time()  # contact-person model extraction
    predictor.getPredictor("epc").predict(list_sentences, list_entitys)
    log("get epc done of doc_id%s" % (doc_id))
    cost_time["person"] = round(time.time() - start_time, 2)

    start_time = time.time()  # time-category extraction
    predictor.getPredictor("time").predict(list_sentences, list_entitys)
    log("get time done of doc_id%s" % (doc_id))
    cost_time["time"] = round(time.time() - start_time, 2)

    start_time = time.time()  # deposit payment method
    payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content)
    cost_time["deposit"] = round(time.time() - start_time, 2)

    # Must run after getPredictor("prem").predict and before getAttributes.getPREMs:
    # rule-based correction of the fee in supervision/construction/design/survey notices to the bidding or winning amount.
    predictor.getPredictor("prem").correct_money_by_rule(title, list_entitys, list_articles)

    # Added 2021-12-29: extract total price and unit price.
    start_time = time.time()
    predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys)
    cost_time["total_unit_money"] = round(time.time() - start_time, 2)

    # depends on sentence order
    start_time = time.time()  # entity linking
    entityLink.link_entitys(list_entitys)
    doctitle_refine = entityLink.doctitle_refine(title)
    nlp_enterprise, nlp_enterprise_attachment, dict_enterprise = entityLink.get_nlp_enterprise(list_entitys[0])
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles, list_outlines, page_time)
    log("get attributes done of doc_id%s" % (doc_id))
    cost_time["attrs"] = round(time.time() - start_time, 2)

    candidate_dic = {}  # keep defined when the candidate step below is skipped (original_docchannel == 302)
    if original_docchannel != 302:  # approval projects (302) skip the extraction below
        '''Table element extraction'''
        table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise, web_source_name)
        # print('table-extracted winners:', table_prem)
        # print('originally extracted roles:', prem[0]['prem'])
        if table_prem:
            getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem, in_attachment=in_attachment)

        '''Candidate extraction'''
        candidate_top3_prem, candidate_dic, in_attachment = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise)
        # print('table-extracted candidates:', candidate_top3_prem)
        getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem, in_attachment=in_attachment)

        '''Get consortium (joint bid) winner information'''
        getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)

    '''Fix winning prices when a procurement notice lists multiple products in table form; if the winning amount is less than the total of all products, use the total'''
    getAttributes.correct_rolemoney(prem, total_product_money, list_articles)

    '''Fix the channel class: notices predicted as tender announcements that contain a winner, and notices predicted as win notices that lack winning keywords'''  # depends on prem
    start_time = time.time()
    # content = list_articles[0].content
    # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
    if original_docchannel == 302:
        channel_dic = {"docchannel":
                           {"docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目"}  # "审批项目" = approval project
                       }
        prem[0]['prem'] = {}  # approval projects do not need prem
    else:
        channel_dic, msc = predictor.getPredictor("channel").predict_merge(title, list_sentences[0], text, list_articles[0].bidway, prem[0], original_docchannel)
        # print('msc', msc)
    cost_time["rule_channel"] = round(time.time() - start_time, 2)

    '''Extract multiple winners within one package, and all money values'''
    all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys, list_sentences)

    start_time = time.time()  # product names and bid-failure reason extraction; depends on the docchannel result
    fail = channel_dic['docchannel']['docchannel'] == "废标公告"  # "废标公告" = bid-failure notice
    fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences, list_entitys, list_articles, fail)  # returns only the failure reason; products are added to the Entity class. 2022/7/29: also returns the product list for the industry classifier
    # predictor.getPredictor("product").predict(list_sentences, list_entitys)
    log("get product done of doc_id%s" % (doc_id))
    cost_time["product"] = round(time.time() - start_time, 2)

    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0], page_time, prem, channel_dic))

    '''For single-source procurement notices, mark the winner role as pre-winner'''
    getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)

    '''Procurement-intention prediction when the notice has no table format'''  # depends on the docchannel result, products and prem
    '''Merge product-attribute results found in the project name into the demand, budget-time and procurement-time fields'''
    predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences, list_entitys, list_outlines, product_list, codeName, prem, text, page_time)

    '''Industry classification: uses the title, project name, products and the roles in prem'''
    industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem)

    '''Region extraction'''
    start_time = time.time()
    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem, title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
    cost_time["district"] = round(time.time() - start_time, 2)

    '''Repair entities using the district result'''
    repair_entity(prem, district, list_articles)

    '''Final tenderee recall based on the web source'''
    prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(web_source_no, web_source_name, prem)

    '''Generate project labels from the keyword table'''
    project_label = predictor.getPredictor('project_label').predict(title, product=','.join(product_list), project_name=codeName[0]['name'], prem=prem)
    # additional requested labels
    project_label = predictor.getPredictor('project_label').predict_other(project_label, industry, title, codeName[0]['name'], ','.join(product_list), list_articles)
    # print(project_label)

    '''Second-level property-rights classification labels'''
    property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list), project_name=codeName[0]['name'], prem=prem, channel_dic=channel_dic)

    '''Final validation of prem'''
    getAttributes.confirm_prem(prem[0]['prem'], channel_dic)

    # Extract the fields needed for proposed-construction (拟在建) projects.
    start_time = time.time()
    pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text)
    log("pb_extract done of doc_id%s" % (doc_id))
    cost_time["pb_extract"] = round(time.time() - start_time, 2)

    '''Tagging'''
    label_dic = get_all_label(title, list_articles[0].content)

    # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
    # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
    version_date = {'version_date': '2024-07-30'}
    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)

    if original_docchannel == 302:
        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys)
        data_res['approval'] = approval

    if channel_dic['docchannel']['doctype'] == '处罚公告':  # 2024-06-27: extract dishonesty elements from punishment notices ("处罚公告")
        start_time = time.time()  # dishonesty-element extraction
        punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles, list_sentences, list_entitys)
        cost_time["punish"] = round(time.time() - start_time, 2)
        data_res['punish'] = punish_dic
        if "Project" in data_res['prem']:
            for d in data_res['prem']['Project']['roleList']:
                if d['role_name'] == 'tenderee' and d.get('role_prob', 0.6) < 0.6:  # punishment notices: drop low-probability tenderees
                    data_res['prem']['Project']['roleList'] = [d for d in data_res['prem']['Project']['roleList'] if d['role_name'] != 'tenderee']
                    break
            if len(data_res['prem']['Project']['roleList']) == 0 and data_res['prem']['Project'].get('tendereeMoney', 0) in [0, '0']:  # remove the empty package
                data_res['prem'].pop('Project')

    # Add products found in product_attrs to the product list.
    for d in data_res['product_attrs']['data']:
        if isinstance(d['product'], str) and d['product'] not in data_res['product']:
            data_res['product'].append(d['product'])

    '''Final check and correction of bidding and winning amounts'''
    getAttributes.limit_maximum_amount(data_res, list_entitys[0])

    data_res["project_label"] = project_label
    data_res["property_label"] = property_label
    data_res["doctitle_refine"] = doctitle_refine
    data_res["nlp_enterprise"] = nlp_enterprise
    data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
    data_res["dict_enterprise"] = dict_enterprise
    # number of extracted elements
    data_res['extract_count'] = extractCount(data_res, page_attachments, web_source_name)
    # whether the document contains a table
    data_res['exist_table'] = 1 if re.search("<td", text) else 0
    data_res["cost_time"] = cost_time
    data_res["success"] = True
    # indexed fields for proposed-construction projects
    data_res["proportion"] = pb_json.get('pb').get('proportion', '')
    data_res["pb_project_name"] = pb_json.get('pb').get('project_name_refind', '')
    # qualification requirements
    data_res['aptitude'] = aptitude_text[:1500]
    # procurement content
    data_res['requirement'] = requirement_text[:1500]
    # labels
    data_res['label_dic'] = label_dic
    # bid-opening address
    data_res['addr_dic'] = {'addr_bidopen': addr_bidopen_text}
    # bid-submission address
    data_res['addr_dic']['addr_bidsend'] = addr_bidsend_text

    # for _article in list_articles:
    #     log(_article.content)
    #
    # for list_entity in list_entitys:
    #     for _entity in list_entity:
    #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s" %
    #             (str(_entity.entity_type), str(_entity.entity_text), str(_entity.label), str(_entity.values), str(_entity.sentence_index),
    #              str(_entity.begin_index), str(_entity.end_index)))

    _extract_json = json.dumps(data_res, cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
    _extract_json = _extract_json.replace("\x06", "").replace("\x05", "").replace("\x07", "")
    return _extract_json  # , list_articles[0].content, get_ent_context(list_sentences, list_entitys)
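# Call sketch (hedged; mirrors the __main__ block below). predict() returns a JSON
# string, so callers typically do:
#     rs = json.loads(predict(doc_id, html_text, title))
# and then read keys assembled above, e.g. rs["prem"], rs["docchannel"],
# rs["extract_count"], rs["cost_time"], rs["success"].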
def test(name, content):
    user = {
        "content": content,
        "id": name
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post("http://192.168.2.102:15030" + '/article_extract', json=user, headers=myheaders, verify=True)
    resp_json = _resp.content.decode("utf-8")
    # print(resp_json)
    return resp_json
def get_ent_context(list_sentences, list_entitys):
    rs_list = []
    sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
    for list_entity in list_entitys:
        for _entity in list_entity:
            if _entity.entity_type in ['org', 'company', 'money']:
                s = sentences[_entity.sentence_index].sentence_text
                b = _entity.wordOffset_begin
                e = _entity.wordOffset_end
                # print("%s %d %.4f; %s %s %s" % (_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b-10):b], _entity.entity_text, s[e:e+10]))
                rs_list.append("%s %d %.4f; %s ## %s ## %s" % (_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b-10):b], _entity.entity_text, s[e:e+10]))
    return '\n'.join(rs_list)
def get_role_context(docid, list_sentences, list_entitys):
    rs_list = []
    sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
    for list_entity in list_entitys:
        for _entity in list_entity:
            if _entity.entity_type in ['org', 'company']:
                idx = _entity.entity_id
                sentence = sentences[_entity.sentence_index]
                # _span = spanWindow(tokens=sentence.tokens, begin_index=_entity.begin_index, end_index=_entity.end_index, size=20,
                #                    center_include=False, word_flag=True, text=_entity.entity_text)
                _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=40, center_include=False)
                rs_list.append((docid, idx, _entity.entity_type, _entity.label, '%.4f' % _entity.values[_entity.label], _span[0],
                                _entity.entity_text, _span[1]))
    return rs_list
if __name__ == "__main__":
    import pandas as pd
    t1 = time.time()
    # text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
    title = '打印机'
    # df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
    # # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
    # for i in range(30, 50, 1):
    #     text = df.loc[i, 'dochtmlcon']
    #     rs = json.loads(predict('', text, ''))
    #     print(rs['demand_info'])
    #     print(rs['product'])
    #     print(rs['product_attrs'])
    #     print(rs)
    with open('D:/html/2.html', 'r', encoding='utf-8') as f:
        text = f.read()
    t1 = time.time()
    print(predict('', text, title))
    t2 = time.time()
    print(predict('', text, title))
    t3 = time.time()
    print('first run: %.4fs, second run: %.4fs' % (t2 - t1, t3 - t2))
    # print(predict('', text, title))
    # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]
    # df = pd.read_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115_2.xlsx')[:]
    # new_prem = []
    # for i in range(len(df)):
    #     i = 530
    #     doc_id = df.loc[i, 'docid']
    #     text = df.loc[i, 'html']
    #     # title = df.loc[i, 'doctitle']
    #     rs = predict(doc_id, text)
    #     rs = json.loads(rs)
    #     prem = json.dumps(rs['prem'], ensure_ascii=False)
    #     # print(rs)
    #     new_prem.append(prem)
    #     print(prem)
    #     break
    # df['new_prem'] = pd.Series(new_prem)
    # print('elapsed:', time.time() - t1)
    # # df.to_excel('E:/大网站规则识别/大网站要素提取结果20211115.xlsx')
    # df.to_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115.xlsx')
    # # pass