'''
Created on 2019-01-04

@author: User
'''
import os
from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import codecs
import requests
import time

_time1 = time.time()
sys.path.append(os.path.abspath("../.."))
from BiddingKG.dl.common.Utils import *
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.complaint.punish_predictor as punish_rule
import json
from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
from BiddingKG.dl.ratio.re_ratio import extract_ratio

# Custom JSON encoder: makes numpy arrays/floats and bytes JSON-serializable
class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, (np.float_, np.float16, np.float32,
                              np.float64)):
            return float(obj)
        elif isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)
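
# A minimal usage sketch for MyEncoder (assumption: numpy is exposed as np by the
# wildcard import from BiddingKG.dl.common.Utils above). It shows how values that
# json cannot serialize natively are converted:
#
#     json.dumps({"prob": np.float32(0.5), "ids": np.array([1, 2])},
#                cls=MyEncoder, ensure_ascii=False)
#     # -> '{"prob": 0.5, "ids": [1, 2]}'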


def predict(doc_id, text, title="", page_time="", **kwargs):
    cost_time = dict()
    start_time = time.time()
    log("start process doc %s"%(str(doc_id)))
    list_articles, list_sentences, list_entitys, _cost_time = Preprocessing.get_preprocessed([[doc_id, text, "", "", title, page_time]], useselffool=True)
    log("get preprocessed done of doc_id%s"%(doc_id))
    cost_time["preprocess"] = round(time.time()-start_time, 2)
    cost_time.update(_cost_time)

    for list_entity in list_entitys:
        for _entity in list_entity:
            log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
                (str(_entity.entity_type), str(_entity.entity_text), str(_entity.label), str(_entity.values), str(_entity.sentence_index),
                 str(_entity.begin_index), str(_entity.end_index)))

    # Depends on sentence order
    start_time = time.time()  # announcement channel / document lifecycle classification
    list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
    cost_time["channel"] = round(time.time()-start_time, 2)

    start_time = time.time()  # project code and project name extraction
    codeName = predictor.getPredictor("codeName").predict(list_sentences, MAX_AREA=5000, list_entitys=list_entitys)
    log("get codename done of doc_id%s"%(doc_id))
    cost_time["codename"] = round(time.time()-start_time, 2)

    start_time = time.time()  # role / money model extraction
    predictor.getPredictor("prem").predict(list_sentences, list_entitys)
    log("get prem done of doc_id%s"%(doc_id))
    cost_time["prem"] = round(time.time()-start_time, 2)

    start_time = time.time()  # product name and bid-invalidation reason extraction
    predictor.getPredictor("product").predict(list_sentences, list_entitys)
    log("get product done of doc_id%s"%(doc_id))
    cost_time["product"] = round(time.time()-start_time, 2)

    start_time = time.time()  # regex extraction of product attributes: unit price, quantity, brand/spec; project, demand, budget, time
    product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
    log("get product attributes done of doc_id%s"%(doc_id))
    cost_time["product_attrs"] = round(time.time()-start_time, 2)

    start_time = time.time()  # rule-based (regex) role extraction
    predictor.getPredictor("roleRule").predict(list_articles, list_sentences, list_entitys, codeName)
    cost_time["rule"] = round(time.time()-start_time, 2)

    '''Regex supplement: treat a date-format entity in the last sentence as tenderer or agency 2021/12/30'''
    start_time = time.time()  # rule-based (regex) role extraction, final pass
    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys)
    cost_time["roleRuleFinal"] = round(time.time()-start_time, 2)

    start_time = time.time()  # contact-person model extraction
    predictor.getPredictor("epc").predict(list_sentences, list_entitys)
    log("get epc done of doc_id%s"%(doc_id))
    cost_time["person"] = round(time.time()-start_time, 2)

    start_time = time.time()  # time-category extraction
    predictor.getPredictor("time").predict(list_sentences, list_entitys)
    log("get time done of doc_id%s"%(doc_id))
    cost_time["time"] = round(time.time()-start_time, 2)

    start_time = time.time()  # deposit (bid bond) payment method
    payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content)
    cost_time["deposit"] = round(time.time()-start_time, 2)

    # Must run after getPredictor("prem") and before getAttributes.getPREMs:
    # for supervision/design/survey titles with no construction/EPC keyword,
    # relabel the matching fee entity according to the announcement channel.
    if len(re.findall('监理|施工|设计|勘察', title)) == 1 and re.search('施工|总承包|epc|EPC', title) == None:
        keyword = re.search('监理|设计|勘察', title).group(0)
        for list_entity in list_entitys:
            for _entity in list_entity:
                # print('keyword:', keyword, '_entity.notes:', _entity.notes)
                if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label == 2:
                    if list_channel_dic[0]['docchannel'] == "招标公告":
                        _entity.values[0] = 0.51
                        _entity.set_Money(0, _entity.values)  # 2021/11/18 reclassify the fee as tender amount or winning-bid amount by announcement channel
                    else:
                        _entity.values[1] = 0.51
                        _entity.set_Money(1, _entity.values)

    # Added 2021-12-29: extract total price and unit price
    start_time = time.time()  # total/unit price extraction
    predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys)
    cost_time["total_unit_money"] = round(time.time()-start_time, 2)

    # Depends on sentence order
    start_time = time.time()  # entity linking
    entityLink.link_entitys(list_entitys)
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    log("get attributes done of doc_id%s"%(doc_id))
    cost_time["attrs"] = round(time.time()-start_time, 2)

    start_time = time.time()  # penalty / dishonesty (punish) information extraction
    list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles, list_sentences, list_entitys)
    cost_time["punish"] = round(time.time()-start_time, 2)

    if len(product_attrs[1]['demand_info']['data']) > 0:
        for d in product_attrs[1]['demand_info']['data']:
            for product in set(prem[0]['product']):
                if product in d['project_name']:
                    d['product'].append(product)  # add products found in the project name to the demand info

    # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic)[0]
    # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic), list_channel_dic)[0]
    data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1], **payment_way_dic)
    data_res["cost_time"] = cost_time
    data_res["success"] = True

    for _article in list_articles:
        log(_article.content)
    for list_entity in list_entitys:
        for _entity in list_entity:
            log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
                (str(_entity.entity_type), str(_entity.entity_text), str(_entity.label), str(_entity.values), str(_entity.sentence_index),
                 str(_entity.begin_index), str(_entity.end_index)))

    return json.dumps(data_res, cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
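
# A hedged usage sketch for predict(): it takes a document id and the raw
# HTML/text of one announcement (plus optional title and page_time) and returns
# a JSON string. The id and file path below are placeholders, and the BiddingKG
# model files must already be available locally for this to run:
#
#     with open('sample_announcement.html', 'r', encoding='utf-8') as f:
#         result = json.loads(predict("demo_doc_id", f.read(), title="demo title"))
#     print(result.get("prem"), result.get("cost_time"))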


def test(name, content):
    user = {
        "content": content,
        "id": name
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post("http://192.168.2.102:15030" + '/article_extract', json=user, headers=myheaders, verify=True)
    resp_json = _resp.content.decode("utf-8")
    # print(resp_json)
    return resp_json
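
# A minimal usage sketch for test(): it posts one document to the internal
# /article_extract service above and returns the raw JSON response. The host is
# only reachable on that internal network; the id and content are placeholders:
#
#     print(test("demo_doc_id", "<html>announcement body ...</html>"))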


if __name__ == "__main__":
    import pandas as pd

    t1 = time.time()
    # text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
    title = '打印机'
    # df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
    # # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
    # for i in range(30, 50, 1):
    #     text = df.loc[i, 'dochtmlcon']
    #     rs = json.loads(predict('', text, ''))
    #     print(rs['demand_info'])
    #     print(rs['product'])
    #     print(rs['product_attrs'])
    #     print(rs)
    with open('D:/html/2.html', 'r', encoding='utf-8') as f:
        text = f.read()
    t1 = time.time()
    print(predict('', text, title))
    t2 = time.time()
    print(predict('', text, title))
    t3 = time.time()
    print('first run: %.4fs, second run: %.4fs' % (t2 - t1, t3 - t2))
    # print(predict('', text, title))

    # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]
    # df = pd.read_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115_2.xlsx')[:]
    # new_prem = []
    # for i in range(len(df)):
    #     i = 530
    #     doc_id = df.loc[i, 'docid']
    #     text = df.loc[i, 'html']
    #     # title = df.loc[i, 'doctitle']
    #     rs = predict(doc_id, text)
    #     rs = json.loads(rs)
    #     prem = json.dumps(rs['prem'], ensure_ascii=False)
    #     # print(rs)
    #     new_prem.append(prem)
    #     print(prem)
    #     break
    # df['new_prem'] = pd.Series(new_prem)
    # print('elapsed:', time.time() - t1)
    # # df.to_excel('E:/大网站规则识别/大网站要素提取结果20211115.xlsx')
    # df.to_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115.xlsx')
    # # pass
|