'''
Created on 2019-01-04
@author: User
'''
import os
import re
import sys
import copy
import codecs
import json
import time
import traceback
import requests
import numpy as np
from bs4 import BeautifulSoup, Comment
from unicodedata import normalize

_time1 = time.time()
sys.path.append(os.path.abspath("../.."))

from BiddingKG.dl.common.Utils import *
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.complaint.punish_predictor as punish_rule
from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
from BiddingKG.dl.ratio.re_ratio import extract_ratio
from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list
from BiddingKG.dl.interface.get_label_dic import get_all_label
# Custom JSON encoder: converts the numpy types and bytes values that json cannot serialize natively
class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)
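
# Illustrative sketch (not used by the pipeline): MyEncoder lets json.dumps
# serialize the numpy scalars/arrays and bytes values that the predictors put
# into the result dict. The payload below is made up.
def _demo_myencoder():
    sample = {"prob": np.float32(0.87), "vector": np.array([1, 2]), "raw": b"bytes"}
    # -> '{"prob": 0.87, "vector": [1, 2], "raw": "bytes"}' (approximately)
    return json.dumps(sample, cls=MyEncoder, ensure_ascii=False)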
def get_login_web_set():
    file = os.path.join(os.path.dirname(__file__), "login_weblist.txt")
    list_web = []
    try:
        if os.path.exists(file):
            with open(file, "r", encoding="utf8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        list_web.append(line)
    except Exception as e:
        traceback.print_exc()
    _set = set(list_web)
    log("get_login_web_set length %d"%(len(_set)))
    return _set

set_login_web = get_login_web_set()
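# Note: login_weblist.txt is assumed to hold one web-source name per line;
# extractCount() below subtracts from the score when web_source_name is in this set.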
def extractCount(extract_dict, page_attachments, web_source_name):
    """Score how many key elements were extracted; used as a completeness metric."""
    # time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
    if len(extract_dict):
        _extract = extract_dict
    else:
        _extract = {}
    # print(_extract)
    dict_pack = _extract.get("prem", {})
    extract_count = 0
    list_code = _extract.get("code", [])
    word_count = _extract.get("word_count", {})
    if word_count.get("正文", 0) > 500:
        extract_count += 3
    if len(list_code) > 0:
        project_code = list_code[0]
    else:
        project_code = ""
    project_name = _extract.get("name", "")
    bidding_budget = ""
    win_tenderer = ""
    win_bid_price = ""
    linklist_count = 0
    for _key in dict_pack.keys():
        if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"] != '' and float(dict_pack[_key]["tendereeMoney"]) > 0:
            extract_count += 1
            if bidding_budget == "":
                bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
        for _role in dict_pack[_key]["roleList"]:
            if isinstance(_role, list):
                extract_count += 1
                if _role[2] != '' and float(_role[2]) > 0:
                    extract_count += 1
                if _role[0] == "tenderee":
                    tenderee = _role[1]
                if _role[0] == "win_tenderer":
                    if _role[1] is not None and _role[1] != "":
                        extract_count += 2
                        if win_tenderer == "":
                            win_tenderer = _role[1]
                    if _role[2] != '' and float(_role[2]) > 0:
                        extract_count += 2
                        if win_bid_price == "":
                            win_bid_price = str(float(_role[2]))
                if _role[0] == "agency":
                    agency = _role[1]
            if isinstance(_role, dict):
                extract_count += 1
                if "role_money" in _role:
                    if str(_role["role_money"].get("money", "")) != '' and float(_role["role_money"].get("money", "")) > 0:
                        extract_count += 1
                if _role.get("role_name") == "tenderee":
                    tenderee = _role["role_text"]
                if _role.get("role_name") == "win_tenderer":
                    if _role["role_text"] is not None and _role["role_text"] != "":
                        extract_count += 2
                        if win_tenderer == "":
                            win_tenderer = _role["role_text"]
                    if "role_money" in _role:
                        if str(_role["role_money"]["money"]) != '' and float(_role["role_money"]["money"]) > 0:
                            extract_count += 2
                            if win_bid_price == "":
                                win_bid_price = str(float(_role["role_money"]["money"]))
                if _role["role_name"] == "agency":
                    agency = _role["role_text"]
                linklist = _role.get("linklist", [])
                for link in linklist:
                    for l in link:
                        if l != "":
                            linklist_count += 1
    extract_count += linklist_count // 2
    if project_code != "":
        extract_count += 1
    if project_name != "":
        extract_count += 1
    if page_attachments is not None and page_attachments != '':
        try:
            _attachments = json.loads(page_attachments)
            has_zhaobiao = False
            has_qingdan = False
            if len(_attachments) > 0:
                for _atta in _attachments:
                    classification = _atta.get("classification", "")
                    if str(classification) == '招标文件':
                        has_zhaobiao = True
                    if str(classification) == '采购清单':
                        has_qingdan = True
                extract_count += 2
            if has_zhaobiao:
                extract_count += 2
            if has_qingdan:
                extract_count += 1
        except Exception as e:
            traceback.print_exc()
    list_approval_dict = _extract.get("approval", [])
    for _dict in list_approval_dict:
        for k, v in _dict.items():
            if v is not None and v != '' and v != "未知":
                extract_count += 1
    punish_dict = _extract.get("punish", {})
    for k, v in punish_dict.items():
        if v is not None and v != '' and v != "未知":
            extract_count += 1
    if web_source_name in set_login_web:
        extract_count -= 3
    return extract_count
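
# Illustrative sketch (hypothetical data): scoring a minimal extraction result.
# A long body text, a project code and name, a budget, and a priced winner each
# add to the count; sources listed in set_login_web are penalized.
def _demo_extract_count():
    sample_extract = {
        "code": ["ZB-2024-001"],
        "name": "办公设备采购项目",
        "word_count": {"正文": 800, "附件": 0},
        "prem": {"Project": {"tendereeMoney": "100000", "roleList": [
            {"role_name": "win_tenderer", "role_text": "某某科技有限公司",
             "role_money": {"money": "98000"}, "linklist": []}]}},
    }
    return extractCount(sample_extract, page_attachments='[]', web_source_name="")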
# Character-encoding normalization
def str_normalize(text):
    # time1 = time.time()
    cn_punctuation = "¥,。:;{}!?()<"
    text_split = re.split("([{}])+".format(cn_punctuation), text)
    # print(text_split)
    new_text = ""
    for s in text_split:
        if re.search("^[{}]+$".format(cn_punctuation), s):
            new_text += s
        else:
            new_text += normalize('NFKD', s)
    # print("str_normalize cost time %s"%str(time.time()-time1))
    # print(new_text)
    return new_text
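
# Illustrative sketch: NFKD folds full-width letters and digits such as "１００"
# to ASCII "100", while the punctuation listed in cn_punctuation is kept
# verbatim rather than being decomposed.
def _demo_str_normalize():
    return str_normalize("预算金额１００万元。")  # -> "预算金额100万元。"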
# Repair entities in prem whose region prefix is incomplete
def repair_entity(prem, district_dict, list_articles):
    district_dict = district_dict['district']
    province = district_dict['province'] if district_dict['province'] and district_dict['province'] not in ['未知', '全国'] else ""
    city = district_dict['city'] if district_dict['city'] and district_dict['city'] != '未知' else ""
    district = district_dict['district'] if district_dict['district'] and district_dict['district'] != '未知' else ""
    content_text = list_articles[0].content
    autonomous_region_dict = {
        "新疆": "新疆维吾尔",
        "西藏": "西藏",
        "内蒙古": "内蒙古",
        "广西": "广西壮族",
        "宁夏": "宁夏回族"
    }
    for package, _prem in prem[0]['prem'].items():
        for role in _prem['roleList']:
            if role['role_name'] in ['tenderee', 'agency']:
                role_text = role['role_text']
                if re.search("^[省市县区]", role_text):
                    if role_text[0] == '省' and role_text[:2] not in ['省道']:
                        role['role_text'] = province + role_text
                    elif role_text[0] == '市' and role_text[:2] not in ['市政', '市场']:
                        if district + '市' in content_text:
                            # county-level city
                            role['role_text'] = district + role_text
                        else:
                            role['role_text'] = city + role_text
                    elif role_text[0] in ['县', '区']:
                        role['role_text'] = district + role_text
                elif re.search("^自治[区州县]", role_text):
                    if role_text[:3] == '自治区':
                        role['role_text'] = autonomous_region_dict.get(province, "") + role_text
                    elif role_text[:3] in ['自治县', "自治州"]:
                        if re.search("自治[县州]?$", district):
                            role['role_text'] = re.sub("自治[县州]?", "", district) + role_text
                        elif re.search("族$", district):
                            role['role_text'] = district + role_text
                        elif re.search("自治[县州]?$", city):
                            role['role_text'] = re.sub("自治[县州]?", "", city) + role_text
                        elif re.search("族$", city):
                            role['role_text'] = city + role_text
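
# Illustrative sketch (hypothetical data): a tenderee beginning with a bare
# "市" gets the city from the district result prepended. The prem/district
# shapes follow what predict() passes in; the values are made up.
def _demo_repair_entity():
    from types import SimpleNamespace
    prem = [{"prem": {"Project": {"roleList": [
        {"role_name": "tenderee", "role_text": "市第一人民医院"}]}}}]
    district = {"district": {"province": "广东", "city": "广州", "district": "天河"}}
    articles = [SimpleNamespace(content="广州市天河区的公告正文")]
    repair_entity(prem, district, articles)
    return prem[0]["prem"]["Project"]["roleList"][0]["role_text"]  # "广州市第一人民医院"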
def predict(doc_id, text, title="", page_time="", web_source_no='', web_source_name="", original_docchannel='', page_attachments='[]', **kwargs):
    cost_time = dict()
    start_time = time.time()
    log("start process doc %s"%(str(doc_id)))

    # Character-encoding normalization
    text = str_normalize(text)

    list_articles, list_sentences, list_entitys, list_outlines, _cost_time = Preprocessing.get_preprocessed([[doc_id, text, "", "", title, page_time, web_source_no]], useselffool=True)
    log("get preprocessed done of doc_id%s"%(doc_id))
    cost_time["preprocess"] = round(time.time()-start_time, 2)
    cost_time.update(_cost_time)

    '''Outline extraction and extraction of outline-related content'''
    sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
    parse_document = ParseDocument(text, True, list_obj=sentence2_list)
    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines = extract_parameters(parse_document, list_articles[0].content)
    # Fall back to the attachment sentences when the main text yields nothing
    if sentence2_list_attach != [] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text == "":
        parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines = extract_parameters(parse_document, list_articles[0].content)

    # Filter out wrong entities whose value in Redis is 0
    # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])

    # Depends on sentence order
    # start_time = time.time()  # announcement type / lifecycle extraction; deprecated here, moved to a later step (2022/4/29)
    # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
    #                                                         web_source_no=web_source_no, original_docchannel=original_docchannel)
    # cost_time["channel"] = round(time.time()-start_time, 2)

    start_time = time.time()  # project code and name extraction
    codeName = predictor.getPredictor("codeName").predict(list_sentences, MAX_AREA=5000, list_entitys=list_entitys)
    log("get codename done of doc_id%s"%(doc_id))
    cost_time["codename"] = round(time.time()-start_time, 2)

    start_time = time.time()  # announcement channel (docchannel) prediction
    channel_dic, msc = predictor.getPredictor("channel").predict_merge(title, list_sentences[0], text, original_docchannel, web_source_no)
    cost_time["rule_channel"] = round(time.time()-start_time, 2)

    start_time = time.time()  # role/money model extraction
    predictor.getPredictor("prem").predict(list_sentences, list_entitys)
    log("get prem done of doc_id%s"%(doc_id))
    cost_time["prem"] = round(time.time()-start_time, 2)

    # start_time = time.time()  # product name and bid-failure reason extraction; deprecated here, moved to a later step (2022/4/29)
    # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
    # fail_reason = predictor.getPredictor("product").predict(list_sentences, list_entitys, list_articles, fail)  # only returns the failure reason; products are added to the Entity objects
    # # predictor.getPredictor("product").predict(list_sentences, list_entitys)
    # log("get product done of doc_id%s"%(doc_id))
    # cost_time["product"] = round(time.time()-start_time, 2)

    start_time = time.time()  # regex extraction of product elements: unit price, quantity, brand/spec; project, demand, budget, time
    product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
    log("get product attributes done of doc_id%s"%(doc_id))
    cost_time["product_attrs"] = round(time.time()-start_time, 2)

    start_time = time.time()  # rule-based role extraction
    predictor.getPredictor("roleRule").predict(list_articles, list_sentences, list_entitys, codeName, channel_dic, all_winner=is_all_winner(title))
    cost_time["rule"] = round(time.time()-start_time, 2)

    '''Rules: treat date-formatted entities in the last sentence as tenderee or agency (2021/12/30); final rule-based role completion, and removal of tenderees containing 公共资源交易中心'''
    start_time = time.time()  # rule-based role extraction
    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_sentences, list_entitys, codeName)
    cost_time["roleRuleFinal"] = round(time.time()-start_time, 2)

    start_time = time.time()  # rule-based tenderee recall
    predictor.getPredictor("tendereeRuleRecall").predict(list_articles, list_sentences, list_entitys, codeName)
    cost_time["tendereeRuleRecall"] = round(time.time()-start_time, 2)

    '''Rule-based adjustment of role probabilities'''
    start_time = time.time()
    predictor.getPredictor("rolegrade").predict(list_sentences, list_entitys, original_docchannel)
    cost_time["rolegrade"] = round(time.time()-start_time, 2)

    '''Rule-based adjustment of money probabilities'''
    start_time = time.time()
    predictor.getPredictor("moneygrade").predict(list_sentences, list_entitys)
    cost_time["moneygrade"] = round(time.time()-start_time, 2)

    start_time = time.time()  # contact-person model extraction
    predictor.getPredictor("epc").predict(list_sentences, list_entitys)
    log("get epc done of doc_id%s"%(doc_id))
    cost_time["person"] = round(time.time()-start_time, 2)

    start_time = time.time()  # time-category extraction
    predictor.getPredictor("time").predict(list_sentences, list_entitys)
    log("get time done of doc_id%s"%(doc_id))
    cost_time["time"] = round(time.time()-start_time, 2)

    start_time = time.time()  # deposit payment method
    payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content)
    cost_time["deposit"] = round(time.time()-start_time, 2)

    # Must run after getPredictor("prem").predict and before getAttributes.getPREMs:
    # rule-adjust the fee of supervision/construction/design/survey announcements to the bidding or winning amount
    predictor.getPredictor("prem").correct_money_by_rule(title, list_entitys, list_articles)

    # Added 2021-12-29: extract total price and unit price
    start_time = time.time()
    predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys)
    cost_time["total_unit_money"] = round(time.time()-start_time, 2)

    # Depends on sentence order
    start_time = time.time()  # entity linking
    entityLink.link_entitys(list_entitys)
    doctitle_refine = entityLink.doctitle_refine(title)
    nlp_enterprise, nlp_enterprise_attachment, dict_enterprise = entityLink.get_nlp_enterprise(list_entitys[0])
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles, list_outlines, page_time)
    log("get attributes done of doc_id%s"%(doc_id))
    cost_time["attrs"] = round(time.time()-start_time, 2)

    candidate_dic = {}  # keep defined for the approval-project (302) path, which skips candidate extraction
    if original_docchannel != 302:  # approval projects skip the extraction below
        '''Table element extraction'''
        table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise+nlp_enterprise_attachment, web_source_name, is_all_winner(title))
        # print('表格提取中标人:', table_prem)
        # print('原提取角色:', prem[0]['prem'])
        if table_prem:
            getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem, in_attachment=in_attachment)
        '''Candidate extraction'''
        candidate_top3_prem, candidate_dic, in_attachment = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise+nlp_enterprise_attachment)
        # print('表格提取候选人:', candidate_top3_prem)
        getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem, in_attachment=in_attachment)

    '''Consortium (joint-bid) information'''
    getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)

    '''Fix the winning price of table-style notices with multiple products: if the winning amount is less than the total of all products, use the total'''
    getAttributes.correct_rolemoney(prem, total_product_money, list_articles)

    '''Fix channel predictions: bidding notices that have a winner, and win notices without win keywords'''  # depends on prem
    start_time = time.time()
    # content = list_articles[0].content
    # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
    if original_docchannel == 302:
        channel_dic = {"docchannel":
                           {"docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目"}
                       }
        prem[0]['prem'] = {}  # approval projects do not keep prem
    else:
        channel_dic, msc = predictor.getPredictor("channel").final_change(channel_dic, prem[0], title, text, original_docchannel, msc)
        # print('msc', msc)
    cost_time["rule_channel2"] = round(time.time()-start_time, 2)

    '''Extract multiple winners within one package, plus all money values'''
    all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys, list_sentences, is_all_winner(title))

    start_time = time.time()  # product name and bid-failure reason extraction; depends on the docchannel result
    fail = channel_dic['docchannel']['docchannel'] == "废标公告"
    fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences, list_entitys, list_articles, fail, out_lines=out_lines)  # only returns the failure reason; products are added to the Entity objects (2022/7/29: also returns the product list for industry classification)
    # predictor.getPredictor("product").predict(list_sentences, list_entitys)
    log("get product done of doc_id%s"%(doc_id))
    cost_time["product"] = round(time.time()-start_time, 2)

    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0], page_time, prem, channel_dic))

    '''Update the winner role of single-source notices to pre-winner'''
    getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)

    '''Procurement-intention prediction for notices without tables'''  # depends on docchannel, products and prem
    '''Add product-element results found in the project name to the demand, budget-time and procurement-time elements'''
    predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences, list_entitys, list_outlines, product_list, codeName, prem, text, page_time)

    '''Industry classification, using the title, project name, products and the roles in prem'''
    industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem)

    '''Region extraction'''
    start_time = time.time()
    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem, title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
    cost_time["district"] = round(time.time() - start_time, 2)

    '''Repair entities based on the district result'''
    repair_entity(prem, district, list_articles)

    '''Final tenderee recall based on the data source'''
    prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(web_source_no, web_source_name, prem)

    '''Generate project labels from the keyword table'''
    project_label = predictor.getPredictor('project_label').predict(title, product=','.join(product_list), project_name=codeName[0]['name'], prem=prem)
    # additionally requested labels
    project_label = predictor.getPredictor('project_label').predict_other(project_label, industry, title, codeName[0]['name'], ','.join(product_list), list_articles)
    # print(project_label)

    '''Second-level property-classification labels'''
    property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list), project_name=codeName[0]['name'], prem=prem, channel_dic=channel_dic)

    '''Final validation of prem'''
    getAttributes.confirm_prem(prem[0]['prem'], channel_dic)

    # Extract the fields needed for proposed-construction (拟在建) projects
    start_time = time.time()
    pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text)
    log("pb_extract done of doc_id%s"%(doc_id))
    cost_time["pb_extract"] = round(time.time() - start_time, 2)

    '''Tagging'''
    label_dic = get_all_label(title, list_articles[0].content)

    # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic)[0]
    # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic), list_channel_dic)[0]
    version_date = {'version_date': '2024-09-26'}
    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)

    if original_docchannel == 302:
        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text)
        data_res['approval'] = approval

    if channel_dic['docchannel']['doctype'] == '处罚公告':  # 20240627: punishment announcements get dishonesty-element extraction
        start_time = time.time()  # dishonesty-data element extraction
        punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles, list_sentences, list_entitys)
        cost_time["punish"] = round(time.time()-start_time, 2)
        data_res['punish'] = punish_dic
        if "Project" in data_res['prem']:
            for d in data_res['prem']['Project']['roleList']:
                if d['role_name'] == 'tenderee' and d.get('role_prob', 0.6) < 0.6:  # drop low-probability tenderees in punishment announcements
                    data_res['prem']['Project']['roleList'] = [r for r in data_res['prem']['Project']['roleList'] if r['role_name'] != 'tenderee']
                    break
            if len(data_res['prem']['Project']['roleList']) == 0 and data_res['prem']['Project'].get('tendereeMoney', 0) in [0, '0']:  # delete the empty package
                data_res['prem'].pop('Project')

    # Add products found in the product attributes to the product list
    for d in data_res['product_attrs']['data']:
        if isinstance(d['product'], str) and d['product'] not in data_res['product']:
            data_res['product'].append(d['product'])

    '''Final check and correction of the bidding and winning amounts'''
    getAttributes.limit_maximum_amount(data_res, list_entitys[0])

    data_res["project_label"] = project_label
    data_res["property_label"] = property_label
    data_res["doctitle_refine"] = doctitle_refine
    data_res["nlp_enterprise"] = nlp_enterprise
    data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
    data_res["dict_enterprise"] = dict_enterprise
    # number of extracted elements
    data_res['extract_count'] = extractCount(data_res, page_attachments, web_source_name)
    # whether the document contains a table
    data_res['exist_table'] = 1 if re.search("<td", text) else 0
    data_res["cost_time"] = cost_time
    data_res["success"] = True
    # indexed fields for proposed-construction projects (guarded with .get in case 'pb' is absent)
    data_res["proportion"] = pb_json.get('pb', {}).get('proportion', '')
    data_res["pb_project_name"] = pb_json.get('pb', {}).get('project_name_refind', '')
    # qualification requirements
    data_res['aptitude'] = aptitude_text[:1500]
    # procurement content
    data_res['requirement'] = requirement_text[:1500]
    # labels
    data_res['label_dic'] = label_dic
    # bid-opening address
    data_res['addr_dic'] = {'addr_bidopen': addr_bidopen_text}
    # bid-submission address
    data_res['addr_dic']['addr_bidsend'] = addr_bidsend_text
    # word counts (maxsplit=1 guards against extra '##attachment##' markers)
    if '##attachment##' in list_articles[0].content:
        text_main, text_attn = list_articles[0].content.split('##attachment##', 1)
    else:
        text_main = list_articles[0].content
        text_attn = ""
    data_res['word_count'] = {'正文': len(text_main), '附件': len(text_attn)}
    # limit the number of products
    data_res['product'] = data_res['product'][:500]
    data_res['product_attrs']['data'] = data_res['product_attrs']['data'][:500]

    # for _article in list_articles:
    #     log(_article.content)
    #
    # for list_entity in list_entitys:
    #     for _entity in list_entity:
    #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s" %
    #             (str(_entity.entity_type), str(_entity.entity_text), str(_entity.label), str(_entity.values),
    #              str(_entity.sentence_index), str(_entity.begin_index), str(_entity.end_index)))

    _extract_json = json.dumps(data_res, cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
    _extract_json = _extract_json.replace("\x06", "").replace("\x05", "").replace("\x07", "")
    return _extract_json  # , list_articles[0].content, get_ent_context(list_sentences, list_entitys)
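
# Illustrative sketch (hypothetical document): predict() returns a JSON string,
# so a caller typically parses it back into a dict. Running this requires the
# models behind predictor.getPredictor to be available.
def _demo_predict():
    html = "<html><body>某单位办公设备采购中标公告……</body></html>"
    res = json.loads(predict("demo-docid", html, title="办公设备采购中标公告"))
    return res.get("docchannel"), res.get("prem"), res.get("extract_count")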
def test1(name, content):
    user = {
        "content": content,
        "id": name
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post("http://192.168.2.102:15030" + '/article_extract', json=user, headers=myheaders, verify=True)
    resp_json = _resp.content.decode("utf-8")
    # print(resp_json)
    return resp_json
def get_ent_context(list_sentences, list_entitys):
    rs_list = []
    sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
    for list_entity in list_entitys:
        for _entity in list_entity:
            if _entity.entity_type in ['org', 'company', 'money']:
                s = sentences[_entity.sentence_index].sentence_text
                b = _entity.wordOffset_begin
                e = _entity.wordOffset_end
                # print("%s %d %.4f; %s %s %s"%(_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b-10):b], _entity.entity_text, s[e:e+10]))
                rs_list.append("%s %d %.4f; %s ## %s ## %s"%(_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b-10):b], _entity.entity_text, s[e:e+10]))
    return '\n'.join(rs_list)
def get_role_context(docid, list_sentences, list_entitys):
    rs_list = []
    sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
    for list_entity in list_entitys:
        for _entity in list_entity:
            if _entity.entity_type in ['org', 'company']:
                idx = _entity.entity_id
                sentence = sentences[_entity.sentence_index]
                # _span = spanWindow(tokens=sentence.tokens, begin_index=_entity.begin_index, end_index=_entity.end_index, size=20,
                #                    center_include=False, word_flag=True, text=_entity.entity_text)
                _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=40, center_include=False)
                rs_list.append((docid, idx, _entity.entity_type, _entity.label, '%.4f'%_entity.values[_entity.label], _span[0],
                                _entity.entity_text, _span[1]))
    return rs_list
if __name__ == "__main__":
    import pandas as pd
    t1 = time.time()
    # text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
    title = '打印机'
    # df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
    # # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
    # for i in range(30, 50, 1):
    #     text = df.loc[i, 'dochtmlcon']
    #     rs = json.loads(predict('', text, ''))
    #     print(rs['demand_info'])
    #     print(rs['product'])
    #     print(rs['product_attrs'])
    #     print(rs)
    with open('2.html', 'r', encoding='utf-8') as f:
        text = f.read()
    t1 = time.time()
    print(predict('', text, title))
    t2 = time.time()
    print(predict('', text, title))
    t3 = time.time()
    print('first run: %.4fs, second run: %.4fs' % (t2 - t1, t3 - t2))
    # print(predict('', text, title))
    # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]
    # df = pd.read_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115_2.xlsx')[:]
    # new_prem = []
    # for i in range(len(df)):
    #     i = 530
    #     doc_id = df.loc[i, 'docid']
    #     text = df.loc[i, 'html']
    #     # title = df.loc[i, 'doctitle']
    #     rs = predict(doc_id, text)
    #     rs = json.loads(rs)
    #     prem = json.dumps(rs['prem'], ensure_ascii=False)
    #     # print(rs)
    #     new_prem.append(prem)
    #     print(prem)
    #     break
    # df['new_prem'] = pd.Series(new_prem)
    # print('elapsed:', time.time() - t1)
    # # df.to_excel('E:/大网站规则识别/大网站要素提取结果20211115.xlsx')
    # df.to_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115.xlsx')
    # # pass