''' Created on 2019年1月4日 @author: User ''' import os from bs4 import BeautifulSoup, Comment import copy import re import sys import os import codecs import requests import time _time1 = time.time() sys.path.append(os.path.abspath("../..")) from BiddingKG.dl.common.Utils import * import BiddingKG.dl.entityLink.entityLink as entityLink import BiddingKG.dl.interface.predictor as predictor import BiddingKG.dl.interface.Preprocessing as Preprocessing import BiddingKG.dl.interface.getAttributes as getAttributes import BiddingKG.dl.complaint.punish_predictor as punish_rule import json from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money from BiddingKG.dl.ratio.re_ratio import extract_ratio # 自定义jsonEncoder class MyEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.ndarray): return obj.tolist() elif isinstance(obj, bytes): return str(obj, encoding='utf-8') elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)): return float(obj) elif isinstance(obj,str): return obj return json.JSONEncoder.default(self, obj) def extractCount(extract_dict): # time_pattern = "\d{4}\-\d{2}\-\d{2}.*" if len(extract_dict): _extract = extract_dict else: _extract = {} print(_extract) dict_pack = _extract.get("prem",{}) extract_count = 0 list_code = _extract.get("code",[]) if len(list_code)>0: project_code = list_code[0] else: project_code = "" project_name = _extract.get("name","") bidding_budget = "" win_tenderer = "" win_bid_price = "" for _key in dict_pack.keys(): if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0: extract_count += 1 if bidding_budget=="": bidding_budget = str(float(dict_pack[_key]["tendereeMoney"])) for _role in dict_pack[_key]["roleList"]: if isinstance(_role,list): extract_count += 1 if _role[2]!='' and float(_role[2])>0: extract_count += 1 if _role[0]=="tenderee": tenderee = _role[1] if _role[0]=="win_tenderer": if win_tenderer=="": win_tenderer = _role[1] if _role[2]!='' and float(_role[2])>0: extract_count += 1 if win_bid_price=="": win_bid_price = str(float(_role[2])) if _role[0]=="agency": agency = _role[1] if isinstance(_role,dict): extract_count += 1 if "role_money" in _role: if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0: extract_count += 1 if _role.get("role_name")=="tenderee": tenderee = _role["role_text"] if _role.get("role_name")=="win_tenderer": if win_tenderer=="": win_tenderer = _role["role_text"] if "role_money" in _role: if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0: extract_count += 1 if win_bid_price=="": win_bid_price = str(float(_role["role_money"]["money"])) if _role["role_name"]=="agency": agency = _role["role_text"] if project_code!="": extract_count += 1 if project_name!="": extract_count += 1 return extract_count def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs): cost_time = dict() start_time = time.time() log("start process doc %s"%(str(doc_id))) list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True) log("get preprocessed done of doc_id%s"%(doc_id)) cost_time["preprocess"] = round(time.time()-start_time,2) cost_time.update(_cost_time) #依赖句子顺序 start_time = time.time() # 公告类型/生命周期提取 channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0], web_source_no=web_source_no) cost_time["channel"] = round(time.time()-start_time,2) start_time = time.time() # 项目编号、名称提取 codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys) log("get codename done of doc_id%s"%(doc_id)) cost_time["codename"] = round(time.time()-start_time,2) start_time = time.time() # 角色金额模型提取 predictor.getPredictor("prem").predict(list_sentences,list_entitys) log("get prem done of doc_id%s"%(doc_id)) cost_time["prem"] = round(time.time()-start_time,2) start_time = time.time() # 产品名称及废标原因提取 fail = channel_dic['docchannel']['docchannel'] == "废标公告" fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类 # predictor.getPredictor("product").predict(list_sentences, list_entitys) log("get product done of doc_id%s"%(doc_id)) cost_time["product"] = round(time.time()-start_time,2) start_time = time.time() # 产品相关要素正则提取 单价、数量、品牌规格 ; 项目、需求、预算、时间 product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time) log("get product attributes done of doc_id%s"%(doc_id)) cost_time["product_attrs"] = round(time.time()-start_time,2) start_time = time.time() #正则角色提取 predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName) cost_time["rule"] = round(time.time()-start_time,2) '''正则补充最后一句实体日期格式为招标或代理 2021/12/30''' start_time = time.time() #正则角色提取 predictor.getPredictor("roleRuleFinal").predict(list_articles,list_sentences,list_entitys, codeName) cost_time["roleRuleFinal"] = round(time.time()-start_time,2) start_time = time.time() #正则招标人召回 predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName) cost_time["tendereeRuleRecall"] = round(time.time()-start_time,2) start_time = time.time() #联系人模型提取 predictor.getPredictor("epc").predict(list_sentences,list_entitys) log("get epc done of doc_id%s"%(doc_id)) cost_time["person"] = round(time.time()-start_time,2) start_time = time.time() # 时间类别提取 predictor.getPredictor("time").predict(list_sentences, list_entitys) log("get time done of doc_id%s"%(doc_id)) cost_time["time"] = round(time.time()-start_time,2) start_time = time.time() # 保证金支付方式 payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content) cost_time["deposit"] = round(time.time()-start_time,2) # 需在getPredictor("prem")后 getAttributes.getPREMs 前 if len(re.findall('监理|施工|设计|勘察', title))==1 and re.search('施工|总承包|epc|EPC',title)==None: keyword = re.search('监理|设计|勘察', title).group(0) for list_entity in list_entitys: for _entity in list_entity: # print('keyword:',keyword, '_entity.notes :',_entity.notes) if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label==2: if channel_dic['docchannel'] == "招标公告": _entity.values[0] = 0.51 _entity.set_Money(0, _entity.values) #2021/11/18 根据公告类别把费用改为招标或中投标金额 else: _entity.values[1] = 0.51 _entity.set_Money(1, _entity.values) # 2021-12-29新增:提取:总价,单价 start_time = time.time() # 总价单价提取 predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys) cost_time["total_unit_money"] = round(time.time()-start_time, 2) # 依赖句子顺序 start_time = time.time() # 实体链接 entityLink.link_entitys(list_entitys) doctitle_refine = entityLink.doctitle_refine(title) nlp_enterprise = entityLink.get_nlp_enterprise(list_entitys[0]) prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines) log("get attributes done of doc_id%s"%(doc_id)) cost_time["attrs"] = round(time.time()-start_time,2) start_time = time.time() #失信数据要素提取 list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys) cost_time["punish"] = round(time.time()-start_time,2) if len(product_attrs[1]['demand_info']['data'])>0: for d in product_attrs[1]['demand_info']['data']: for product in set(prem[0]['product']): if product in d['project_name']: d['product'].append(product) #把产品在项目名称中的添加进需求要素中 '''修正采购公告表格形式多种采购产品中标价格''' if total_product_money>0 and len(prem[0]['prem'])==1: for value in prem[0]['prem'].values(): for l in value['roleList']: try: # if l[0] == 'win_tenderer' and float(l[2])