''' Created on 2019年1月4日 @author: User ''' import os from bs4 import BeautifulSoup, Comment import copy import re import sys import os import codecs import requests import time from unicodedata import normalize _time1 = time.time() sys.path.append(os.path.abspath("../..")) from BiddingKG.dl.common.Utils import * import BiddingKG.dl.entityLink.entityLink as entityLink import BiddingKG.dl.interface.predictor as predictor import BiddingKG.dl.interface.Preprocessing as Preprocessing import BiddingKG.dl.interface.getAttributes as getAttributes import BiddingKG.dl.complaint.punish_predictor as punish_rule import json from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money from BiddingKG.dl.ratio.re_ratio import extract_ratio # 自定义jsonEncoder class MyEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.ndarray): return obj.tolist() elif isinstance(obj, bytes): return str(obj, encoding='utf-8') elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)): return float(obj) elif isinstance(obj,str): return obj return json.JSONEncoder.default(self, obj) def extractCount(extract_dict): # time_pattern = "\d{4}\-\d{2}\-\d{2}.*" if len(extract_dict): _extract = extract_dict else: _extract = {} # print(_extract) dict_pack = _extract.get("prem",{}) extract_count = 0 list_code = _extract.get("code",[]) if len(list_code)>0: project_code = list_code[0] else: project_code = "" project_name = _extract.get("name","") bidding_budget = "" win_tenderer = "" win_bid_price = "" for _key in dict_pack.keys(): if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0: extract_count += 1 if bidding_budget=="": bidding_budget = str(float(dict_pack[_key]["tendereeMoney"])) for _role in dict_pack[_key]["roleList"]: if isinstance(_role,list): extract_count += 1 if _role[2]!='' and float(_role[2])>0: extract_count += 1 if _role[0]=="tenderee": tenderee = _role[1] if _role[0]=="win_tenderer": if win_tenderer=="": win_tenderer = _role[1] if _role[2]!='' and float(_role[2])>0: extract_count += 1 if win_bid_price=="": win_bid_price = str(float(_role[2])) if _role[0]=="agency": agency = _role[1] if isinstance(_role,dict): extract_count += 1 if "role_money" in _role: if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0: extract_count += 1 if _role.get("role_name")=="tenderee": tenderee = _role["role_text"] if _role.get("role_name")=="win_tenderer": if win_tenderer=="": win_tenderer = _role["role_text"] if "role_money" in _role: if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0: extract_count += 1 if win_bid_price=="": win_bid_price = str(float(_role["role_money"]["money"])) if _role["role_name"]=="agency": agency = _role["role_text"] if project_code!="": extract_count += 1 if project_name!="": extract_count += 1 return extract_count # 字符编码标准化 def str_normalize(text): # time1 = time.time() cn_punctuation = "¥,。:;{}!?()" text_split = re.split("([{}])+".format(cn_punctuation),text) # print(text_split) new_text = "" for s in text_split: if re.search("^[{}]+$".format(cn_punctuation),s): new_text += s else: new_text += normalize('NFKD', s) # print("str_normalize cost time %s"%str(time.time()-time1)) # print(new_text) return new_text def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',**kwargs): cost_time = dict() start_time = time.time() log("start process doc %s"%(str(doc_id))) # 字符编码标准化 text = str_normalize(text) list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True) log("get preprocessed done of doc_id%s"%(doc_id)) cost_time["preprocess"] = round(time.time()-start_time,2) cost_time.update(_cost_time) # #依赖句子顺序 # start_time = time.time() # 公告类型/生命周期提取 此处作废 换到后面预测 2022/4/29 # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0], # web_source_no=web_source_no,original_docchannel=original_docchannel) # cost_time["channel"] = round(time.time()-start_time,2) start_time = time.time() # 项目编号、名称提取 codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys) log("get codename done of doc_id%s"%(doc_id)) cost_time["codename"] = round(time.time()-start_time,2) start_time = time.time() # 角色金额模型提取 predictor.getPredictor("prem").predict(list_sentences,list_entitys) log("get prem done of doc_id%s"%(doc_id)) cost_time["prem"] = round(time.time()-start_time,2) # start_time = time.time() # 产品名称及废标原因提取 此处作废 换到后面预测 2022/4/29 # fail = channel_dic['docchannel']['docchannel'] == "废标公告" # fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类 # # predictor.getPredictor("product").predict(list_sentences, list_entitys) # log("get product done of doc_id%s"%(doc_id)) # cost_time["product"] = round(time.time()-start_time,2) start_time = time.time() # 产品相关要素正则提取 单价、数量、品牌规格 ; 项目、需求、预算、时间 product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time) log("get product attributes done of doc_id%s"%(doc_id)) cost_time["product_attrs"] = round(time.time()-start_time,2) start_time = time.time() #正则角色提取 predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName) cost_time["rule"] = round(time.time()-start_time,2) '''正则补充最后一句实体日期格式为招标或代理 2021/12/30;正则最后补充角色及去掉包含 公共资源交易中心 的招标人''' start_time = time.time() #正则角色提取 predictor.getPredictor("roleRuleFinal").predict(list_articles,list_sentences,list_entitys, codeName) cost_time["roleRuleFinal"] = round(time.time()-start_time,2) start_time = time.time() #正则招标人召回 predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName) cost_time["tendereeRuleRecall"] = round(time.time()-start_time,2) '''规则调整角色概率''' start_time = time.time() # predictor.getPredictor("rolegrade").predict(list_sentences,list_entitys) cost_time["rolegrade"] = round(time.time()-start_time,2) '''规则调整金额概率''' start_time = time.time() # predictor.getPredictor("moneygrade").predict(list_sentences,list_entitys) cost_time["moneygrade"] = round(time.time()-start_time,2) start_time = time.time() #联系人模型提取 predictor.getPredictor("epc").predict(list_sentences,list_entitys) log("get epc done of doc_id%s"%(doc_id)) cost_time["person"] = round(time.time()-start_time,2) start_time = time.time() # 时间类别提取 predictor.getPredictor("time").predict(list_sentences, list_entitys) log("get time done of doc_id%s"%(doc_id)) cost_time["time"] = round(time.time()-start_time,2) start_time = time.time() # 保证金支付方式 payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content) cost_time["deposit"] = round(time.time()-start_time,2) # 需在getPredictor("prem").predict后 getAttributes.getPREMs 前 规则调整 监理|施工|设计|勘察类别公告的费用 为招标或中标金额 predictor.getPredictor("prem").correct_money_by_rule(title, list_entitys, list_articles) # 2021-12-29新增:提取:总价,单价 start_time = time.time() # 总价单价提取 predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys) cost_time["total_unit_money"] = round(time.time()-start_time, 2) # 依赖句子顺序 start_time = time.time() # 实体链接 entityLink.link_entitys(list_entitys) doctitle_refine = entityLink.doctitle_refine(title) nlp_enterprise,nlp_enterprise_attachment = entityLink.get_nlp_enterprise(list_entitys[0]) prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines) log("get attributes done of doc_id%s"%(doc_id)) cost_time["attrs"] = round(time.time()-start_time,2) '''表格要素提取''' table_prem = predictor.getPredictor("tableprem").predict(text, nlp_enterprise) if table_prem: getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem) '''候选人提取''' candidate_top3_prem, candidate_dic = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise) getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem) '''获取联合体信息''' getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles) #暂时不执行 # start_time = time.time() #失信数据要素提取 # list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys) # cost_time["punish"] = round(time.time()-start_time,2) '''修正采购公告表格形式多种采购产品中标价格;中标金额小于所有产品总金额则改为总金额''' getAttributes.correct_rolemoney(prem, total_product_money, list_articles) '''修正channel预测类别为招标公告却有中标人及预测为中标信息却无中标关键词的类别''' # 依赖 prem start_time = time.time() # content = list_articles[0].content # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem']) channel_dic, msc = predictor.getPredictor("channel").predict_merge(title,list_sentences[0], text, list_articles[0].bidway, prem[0], original_docchannel) # print('msc', msc) cost_time["rule_channel"] = round(time.time()-start_time,2) start_time = time.time() # 产品名称及废标原因提取 #依赖 docchannel结果 fail = channel_dic['docchannel']['docchannel'] == "废标公告" fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类 #2022/7/29补充返回产品,方便行业分类调用 # predictor.getPredictor("product").predict(list_sentences, list_entitys) log("get product done of doc_id%s"%(doc_id)) cost_time["product"] = round(time.time()-start_time,2) prem[0].update(getAttributes.getOtherAttributes(list_entitys[0])) '''更新单一来源招标公告中标角色为预中标''' getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel) '''公告无表格格式时,采购意向预测''' #依赖 docchannel结果 依赖产品及prem '''把产品要素提取结果在项目名称的添加到 采购需求,预算时间,采购时间 要素中''' predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,codeName,prem,text,page_time) '''行业分类提取,需要用标题、项目名称、产品、及prem 里面的角色''' industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem) '''地区获取''' start_time = time.time() district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys) cost_time["district"] = round(time.time() - start_time, 2) '''限制行业最高金额''' getAttributes.limit_maximum_amount(prem, industry) # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0] # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0] version_date = {'version_date': '2023-04-21'} data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date) data_res["doctitle_refine"] = doctitle_refine data_res["nlp_enterprise"] = nlp_enterprise data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment # 要素的个数 data_res['extract_count'] = extractCount(data_res) # 是否有表格 data_res['exist_table'] = 1 if re.search("