'''
Created on 2019-01-04

@author: User
'''

import os
import sys
import re
import copy
import json
import time
import codecs
import traceback  # used by the except blocks below; the flattened source relied on a star import for it
import requests
import numpy as np  # np is also re-exported by Utils' star import; made explicit here
from unicodedata import normalize
from bs4 import BeautifulSoup, Comment

_time1 = time.time()
sys.path.append(os.path.abspath("../.."))

from BiddingKG.dl.common.Utils import *
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.complaint.punish_predictor as punish_rule
from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
from BiddingKG.dl.ratio.re_ratio import extract_ratio
from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list, extract_addr
from BiddingKG.dl.interface.get_label_dic import get_all_label


# Custom JSON encoder: makes numpy arrays/scalars and bytes serializable.
class MyEncoder(json.JSONEncoder):

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)
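
# Usage sketch (illustrative helper, not called by the pipeline): MyEncoder is
# meant to be passed as the `cls` argument of json.dumps so extraction results
# that still contain numpy scalars/arrays or bytes serialize cleanly.
def _demo_myencoder():
    payload = {"prob": np.float32(0.92), "vec": np.arange(3), "raw": b"abc"}
    return json.dumps(payload, cls=MyEncoder, ensure_ascii=False)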

def get_login_web_set():
    file = os.path.join(os.path.dirname(__file__), "login_weblist.txt")
    list_web = []
    try:
        if os.path.exists(file):
            with open(file, "r", encoding="utf8") as f:
                while 1:
                    line = f.readline()
                    if not line:
                        break
                    line = line.strip()
                    if line:
                        list_web.append(line)
    except Exception as e:
        traceback.print_exc()
    _set = set(list_web)
    log("get_login_web_set length %d" % (len(_set)))
    return _set

set_login_web = get_login_web_set()


def extractCount(extract_dict, page_attachments, web_source_name):
    # time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
    if len(extract_dict):
        _extract = extract_dict
    else:
        _extract = {}
    dict_pack = _extract.get("prem", {})
    extract_count = 0
    list_code = _extract.get("code", [])
    word_count = _extract.get("word_count", {})
    if word_count.get("正文", 0) > 500:  # "正文" = main body text
        extract_count += 3
    if len(list_code) > 0:
        project_code = list_code[0]
    else:
        project_code = ""
    project_name = _extract.get("name", "")
    bidding_budget = ""
    win_tenderer = ""
    win_bid_price = ""
    linklist_count = 0
    for _key in dict_pack.keys():
        if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"] != '' and float(dict_pack[_key]["tendereeMoney"]) > 0:
            extract_count += 1
            if bidding_budget == "":
                bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
        for _role in dict_pack[_key]["roleList"]:
            if isinstance(_role, list):
                extract_count += 1
                if _role[2] != '' and float(_role[2]) > 0:
                    extract_count += 1
                if _role[0] == "tenderee":
                    tenderee = _role[1]
                if _role[0] == "win_tenderer":
                    if _role[1] is not None and _role[1] != "":
                        extract_count += 2
                        if win_tenderer == "":
                            win_tenderer = _role[1]
                    if _role[2] != '' and float(_role[2]) > 0:
                        extract_count += 2
                        if win_bid_price == "":
                            win_bid_price = str(float(_role[2]))
                if _role[0] == "agency":
                    agency = _role[1]
            if isinstance(_role, dict):
                extract_count += 1
                if "role_money" in _role:
                    if str(_role["role_money"].get("money", "")) != '' and float(_role["role_money"].get("money", "")) > 0:
                        extract_count += 1
                if _role.get("role_name") == "tenderee":
                    tenderee = _role["role_text"]
                if _role.get("role_name") == "win_tenderer":
                    if _role["role_text"] is not None and _role["role_text"] != "":
                        extract_count += 2
                        if win_tenderer == "":
                            win_tenderer = _role["role_text"]
                    if "role_money" in _role:
                        if str(_role["role_money"]["money"]) != '' and float(_role["role_money"]["money"]) > 0:
                            extract_count += 2
                            if win_bid_price == "":
                                win_bid_price = str(float(_role["role_money"]["money"]))
                if _role["role_name"] == "agency":
                    agency = _role["role_text"]
                linklist = _role.get("linklist", [])
                for link in linklist:
                    for l in link:
                        if l != "":
                            linklist_count += 1
    extract_count += linklist_count // 2
    if project_code != "":
        extract_count += 1
    if project_name != "":
        extract_count += 1
    if page_attachments is not None and page_attachments != '':
        try:
            _attachments = json.loads(page_attachments)
            has_zhaobiao = False
            has_qingdan = False
            if len(_attachments) > 0:
                for _atta in _attachments:
                    classification = _atta.get("classification", "")
                    if str(classification) == '招标文件':  # tender document
                        has_zhaobiao = True
                    if str(classification) == '采购清单':  # procurement list
                        has_qingdan = True
                extract_count += 2
                if has_zhaobiao:
                    extract_count += 2
                if has_qingdan:
                    extract_count += 1
        except Exception as e:
            traceback.print_exc()
    list_approval_dict = _extract.get("approval", [])
    for _dict in list_approval_dict:
        for k, v in _dict.items():
            if v is not None and v != '' and v != "未知":  # "未知" = unknown
                extract_count += 1
    punish_dict = _extract.get("punish", {})
    for k, v in punish_dict.items():
        if v is not None and v != '' and v != "未知":
            extract_count += 1
    if web_source_name in set_login_web:
        extract_count -= 3
    return extract_count


# Normalize character encodings (NFKD) while leaving common full-width
# Chinese punctuation untouched.
def str_normalize(text):
    # time1 = time.time()
    cn_punctuation = "¥,。:;{}!?()<"
    text_split = re.split("([{}])+".format(cn_punctuation), text)
    new_text = ""
    for s in text_split:
        if re.search("^[{}]+$".format(cn_punctuation), s):
            new_text += s
        else:
            new_text += normalize('NFKD', s)
    # print("str_normalize cost time %s" % str(time.time() - time1))
    return new_text
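
# Illustrative sketch (not part of the pipeline): str_normalize folds
# full-width letters and digits to ASCII via NFKD but keeps the Chinese
# punctuation listed above intact, so downstream regexes see "ABC123"
# while separators such as the full-width colon survive.
def _demo_str_normalize():
    # full-width "ABC123" normalizes to "ABC123"; the ":" stays as-is
    return str_normalize("项目编号:ABC123")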

# Repair prem entities whose region prefix is incomplete (e.g. a role_text
# starting with a bare 省/市/县/区), using the extracted district result.
def repair_entity(prem, district_dict, list_articles):
    district_dict = district_dict['district']
    province = district_dict['province'] if district_dict['province'] and district_dict['province'] not in ['未知', '全国'] else ""
    city = district_dict['city'] if district_dict['city'] and district_dict['city'] != '未知' else ""
    district = district_dict['district'] if district_dict['district'] and district_dict['district'] != '未知' else ""
    content_text = list_articles[0].content
    autonomous_region_dict = {
        "新疆": "新疆维吾尔",
        "西藏": "西藏",
        "内蒙古": "内蒙古",
        "广西": "广西壮族",
        "宁夏": "宁夏回族"
    }
    for package, _prem in prem[0]['prem'].items():
        for role in _prem['roleList']:
            if role['role_name'] in ['tenderee', 'agency']:
                role_text = role['role_text']
                if re.search("^[省市县区]", role_text):
                    if role_text[0] == '省' and role_text[:2] not in ['省道']:
                        role['role_text'] = province + role_text
                    elif role_text[0] == '市' and role_text[:2] not in ['市政', '市场']:
                        if district + '市' in content_text:  # county-level city
                            role['role_text'] = district + role_text
                        else:
                            role['role_text'] = city + role_text
                    elif role_text[0] in ['县', '区']:
                        role['role_text'] = district + role_text
                elif re.search("^自治[区州县]", role_text):
                    if role_text[:3] == '自治区':
                        role['role_text'] = autonomous_region_dict.get(province, "") + role_text
                    elif role_text[:3] in ['自治县', "自治州"]:
                        if re.search("自治[县州]?$", district):
                            role['role_text'] = re.sub("自治[县州]?", "", district) + role_text
                        elif re.search("族$", district):
                            role['role_text'] = district + role_text
                        elif re.search("自治[县州]?$", city):
                            role['role_text'] = re.sub("自治[县州]?", "", city) + role_text
                        elif re.search("族$", city):
                            role['role_text'] = city + role_text


def predict(doc_id, text, title="", page_time="", web_source_no='', web_source_name="",
            original_docchannel='', page_attachments='[]', **kwargs):
    cost_time = dict()
    if web_source_no is None:
        web_source_no = ''
    if web_source_name is None:
        web_source_name = ''
    start_time = time.time()
    log("start process doc %s" % (str(doc_id)))

    # normalize character encodings
    text = str_normalize(text)

    list_articles, list_sentences, list_entitys, list_outlines, _cost_time = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", title, page_time, web_source_no]], useselffool=True)
    log("get preprocessed done of doc_id%s" % (doc_id))
    cost_time["preprocess"] = round(time.time() - start_time, 2)
    cost_time.update(_cost_time)

    '''outline extraction and outline-based content extraction'''
    sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
    parse_document = ParseDocument(text, True, list_obj=sentence2_list)
    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope = extract_parameters(parse_document)
    if sentence2_list_attach != [] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text == "":
        # fall back to the attachment sentences if the main text yields nothing
        parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope = extract_parameters(parse_document)
    if addr_bidopen_text == '':
        addr_bidopen_text = extract_addr(list_articles[0].content)

    # Filter out wrong entities whose value is 0 in Redis
    # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])

    # # depends on sentence order
    # start_time = time.time()
    # Announcement channel / lifecycle extraction -- deprecated here, moved to a later step (2022/4/29)
    # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
    #                                                         web_source_no=web_source_no, original_docchannel=original_docchannel)
    # cost_time["channel"] = round(time.time()-start_time,2)

    start_time = time.time()
    # project code and name extraction
    codeName = predictor.getPredictor("codeName").predict(list_sentences, MAX_AREA=5000, list_entitys=list_entitys)
    log("get codename done of doc_id%s" % (doc_id))
    cost_time["codename"] = round(time.time() - start_time, 2)

    start_time = time.time()
    # announcement channel classification
    channel_dic, msc = predictor.getPredictor("channel").predict_merge(title, list_sentences[0], text, original_docchannel, web_source_no)
    cost_time["rule_channel"] = round(time.time() - start_time, 2)

    start_time = time.time()
    # role/money model extraction
    predictor.getPredictor("prem").predict(list_sentences, list_entitys)
    log("get prem done of doc_id%s" % (doc_id))
    cost_time["prem"] = round(time.time() - start_time, 2)

    # start_time = time.time()
    # Product name and bid-failure reason extraction -- deprecated here, moved to a later step (2022/4/29)
    # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
    # fail_reason = predictor.getPredictor("product").predict(list_sentences, list_entitys, list_articles, fail)  # only returns the failure reason; products are added to the Entity class
    # # predictor.getPredictor("product").predict(list_sentences, list_entitys)
    # log("get product done of doc_id%s"%(doc_id))
    # cost_time["product"] = round(time.time()-start_time,2)

    start_time = time.time()
    # regex extraction of product attributes: unit price, quantity, brand/spec; project, demand, budget, time
    product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
    log("get product attributes done of doc_id%s" % (doc_id))
    cost_time["product_attrs"] = round(time.time() - start_time, 2)

    # whether this is a bank-deposit-type project
    deposit_project = is_deposit_project(title, codeName[0]['name'], requirement_text)

    start_time = time.time()
    # rule-based role extraction
    predictor.getPredictor("roleRule").predict(list_articles, list_sentences, list_entitys, codeName, channel_dic,
                                               all_winner=is_all_winner(title), req_scope=requirement_scope,
                                               deposit_project=deposit_project)
    cost_time["rule"] = round(time.time() - start_time, 2)

    '''Rule-based fixups (2021/12/30): treat a date-formatted entity in the last sentence as
    tenderee or agency; final rule-based role completion; drop tenderees containing
    公共资源交易中心 (public resource trading center).'''
    start_time = time.time()
    # final rule-based role extraction
    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_sentences, list_entitys, codeName)
    cost_time["roleRuleFinal"] = round(time.time() - start_time, 2)
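
    # NOTE (added for clarity): the role extractors run as a cascade. The "prem"
    # model proposes roles, "roleRule" applies regex-based corrections, and
    # "roleRuleFinal" / "tendereeRuleRecall" below only patch what is still
    # missing, so the order of these calls matters.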
cost_time["roleRuleFinal"] = round(time.time()-start_time,2) start_time = time.time() #正则招标人召回 predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName) cost_time["tendereeRuleRecall"] = round(time.time()-start_time,2) '''规则调整角色概率''' start_time = time.time() # predictor.getPredictor("rolegrade").predict(list_sentences,list_entitys,original_docchannel) cost_time["rolegrade"] = round(time.time()-start_time,2) '''规则调整金额概率''' start_time = time.time() # predictor.getPredictor("moneygrade").predict(list_sentences,list_entitys) cost_time["moneygrade"] = round(time.time()-start_time,2) start_time = time.time() #联系人模型提取 predictor.getPredictor("epc").predict(list_sentences,list_entitys) log("get epc done of doc_id%s"%(doc_id)) cost_time["person"] = round(time.time()-start_time,2) start_time = time.time() # 时间类别提取 predictor.getPredictor("time").predict(list_sentences, list_entitys) log("get time done of doc_id%s"%(doc_id)) cost_time["time"] = round(time.time()-start_time,2) start_time = time.time() # 保证金支付方式 payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content) cost_time["deposit"] = round(time.time()-start_time,2) # 需在getPredictor("prem").predict后 getAttributes.getPREMs 前 规则调整 监理|施工|设计|勘察类别公告的费用 为招标或中标金额 predictor.getPredictor("prem").correct_money_by_rule(title, list_entitys, list_articles) # 2021-12-29新增:提取:总价,单价 start_time = time.time() # 总价单价提取 predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys) cost_time["total_unit_money"] = round(time.time()-start_time, 2) # 依赖句子顺序 start_time = time.time() # 实体链接 entityLink.link_entitys(list_entitys) doctitle_refine = entityLink.doctitle_refine(title) nlp_enterprise,nlp_enterprise_attachment, dict_enterprise = entityLink.get_nlp_enterprise(list_entitys[0]) prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time) log("get attributes done of doc_id%s"%(doc_id)) cost_time["attrs"] = round(time.time()-start_time,2) if original_docchannel != 302: # 审批项目不做下面提取 '''表格要素提取''' table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise+nlp_enterprise_attachment, web_source_name, is_all_winner(title)) # print('表格提取中标人:', table_prem) # print('原提取角色:', prem[0]['prem']) if table_prem: getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem, in_attachment=in_attachment) '''候选人提取''' candidate_top3_prem, candidate_dic, in_attachment = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise+nlp_enterprise_attachment) # print('表格提取候选人:', candidate_top3_prem) getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem, in_attachment=in_attachment) '''获取联合体信息''' getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles) '''修正采购公告表格形式多种采购产品中标价格;中标金额小于所有产品总金额则改为总金额''' getAttributes.correct_rolemoney(prem, total_product_money, list_articles) '''修正channel预测类别为招标公告却有中标人及预测为中标信息却无中标关键词的类别''' # 依赖 prem start_time = time.time() # content = list_articles[0].content # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem']) if original_docchannel == 302: channel_dic = {"docchannel": { "docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目" } } prem[0]['prem'] = {} # 审批项目不要这项 else: channel_dic, msc = predictor.getPredictor("channel").final_change(channel_dic, prem[0], original_docchannel, msc) # print('msc', msc) 
cost_time["rule_channel2"] = round(time.time()-start_time,2) '''一包多中标人提取及所有金额提取''' all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, is_all_winner(title)) start_time = time.time() # 产品名称及废标原因提取 #依赖 docchannel结果 fail = channel_dic['docchannel']['docchannel'] == "废标公告" fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail,out_lines=out_lines) #只返回失败原因,产品已加入到Entity类 #2022/7/29补充返回产品,方便行业分类调用 # predictor.getPredictor("product").predict(list_sentences, list_entitys) log("get product done of doc_id%s"%(doc_id)) cost_time["product"] = round(time.time()-start_time,2) prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time,prem,channel_dic)) '''更新单一来源招标公告中标角色为预中标''' getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel) '''公告无表格格式时,采购意向预测''' #依赖 docchannel结果 依赖产品及prem '''把产品要素提取结果在项目名称的添加到 采购需求,预算时间,采购时间 要素中''' predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time) '''行业分类提取,需要用标题、项目名称、产品、及prem 里面的角色''' industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem) '''地区获取''' start_time = time.time() district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys) cost_time["district"] = round(time.time() - start_time, 2) '''根据district提取结果修复实体''' repair_entity(prem,district,list_articles) '''根据数据源最后召回招标人角色''' prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(web_source_no, web_source_name, prem) '''根据关键词表生成项目标签''' project_label = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem) # 额外需求的标签 project_label = predictor.getPredictor('project_label').predict_other(project_label,industry,title,codeName[0]['name'],','.join(product_list),list_articles) # print(project_label) '''产权分类二级标签''' property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list),project_name=codeName[0]['name'], prem=prem,channel_dic=channel_dic) '''最终验证prem''' getAttributes.confirm_prem(prem[0]['prem'], channel_dic) # 提取拟在建所需字段 start_time = time.time() pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text) log("pb_extract done of doc_id%s"%(doc_id)) cost_time["pb_extract"] = round(time.time() - start_time, 2) '''打标签''' label_dic = get_all_label(title, list_articles[0].content) # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0] # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0] version_date = {'version_date': '2024-11-07'} data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json) if original_docchannel == 302: approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text) data_res['approval'] = approval if channel_dic['docchannel']['doctype'] == '处罚公告': # 20240627 处罚公告进行失信要素提取 start_time = time.time() #失信数据要素提取 punish_dic = 
predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys) cost_time["punish"] = round(time.time()-start_time,2) data_res['punish'] = punish_dic if "Project" in data_res['prem']: for d in data_res['prem']['Project']['roleList']: if d['role_name'] == 'tenderee' and d.get('role_prob', 0.6) < 0.6: # 处罚公告 去掉低概率招标人 data_res['prem']['Project']['roleList'] = [d for d in data_res['prem']['Project']['roleList'] if d['role_name'] != 'tenderee'] break if len(data_res['prem']['Project']['roleList']) == 0 and data_res['prem']['Project'].get('tendereeMoney', 0) in [0, '0']: # 删除空包 data_res['prem'].pop('Project') # 把产品属性里面的产品补充到产品列表 for d in data_res['product_attrs']['data']: if isinstance(d['product'], str) and d['product'] not in data_res['product']: data_res['product'].append(d['product']) '''最终检查修正招标、中标金额''' getAttributes.limit_maximum_amount(data_res, list_entitys[0]) data_res["project_label"] = project_label data_res["property_label"] = property_label data_res["doctitle_refine"] = doctitle_refine data_res["nlp_enterprise"] = nlp_enterprise data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment data_res["dict_enterprise"] = dict_enterprise # 要素的个数 data_res['extract_count'] = extractCount(data_res,page_attachments,web_source_name) # 是否有表格 data_res['exist_table'] = 1 if re.search("