"""Bidding-document information extraction entry point.

Created on 2019-01-04 (@author: User).

Runs the full BiddingKG extraction pipeline over one document — preprocessing,
the channel / codeName / prem / product / product_attrs / epc / time
predictors, entity linking, attribute assembly and punish extraction — and
returns the merged result as a JSON string.
"""
import copy
import codecs
import json
import os
import re
import sys
import time

import requests
from bs4 import BeautifulSoup, Comment

_time1 = time.time()
sys.path.append(os.path.abspath("../.."))

# Project imports must come after the sys.path tweak above.
from BiddingKG.dl.common.Utils import *
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.complaint.punish_predictor as punish_rule


# Custom JSON encoder so numpy / bytes values survive json.dumps.
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also serializes numpy arrays, numpy floats and bytes."""

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, np.floating):
            # Covers np.float16/32/64; np.float_ was removed in NumPy 2.0,
            # np.floating is the portable base class for all of them.
            return float(obj)
        elif isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)


def predict(doc_id, text, title="", page_time="", **kwargs):
    """Run the whole extraction pipeline on one document.

    Args:
        doc_id: document identifier, used only for logging.
        text: raw document content (HTML or plain text).
        title: document title, drives the channel predictor and the
            supervision/design/survey money-relabel rule below.
        page_time: publication time passed through to preprocessing and
            the product_attrs predictor.
        **kwargs: accepted for forward compatibility; currently unused.

    Returns:
        A JSON string (UTF-8, indented, sorted keys) merging code/name, prem,
        channel and product-attribute results plus per-stage timings under
        "cost_time" and a "success" flag.
    """
    cost_time = dict()

    start_time = time.time()
    log("start process doc %s"%(str(doc_id)))
    list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
    log("get preprocessed done of doc_id%s"%(doc_id))
    cost_time["preprocess"] = round(time.time()-start_time,2)
    cost_time.update(_cost_time)

    # Depends on sentence order: channel prediction uses the first
    # document's sentence list as produced by preprocessing.
    start_time = time.time()
    list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
    cost_time["channel"] = round(time.time()-start_time,2)

    start_time = time.time()
    codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
    log("get codename done of doc_id%s"%(doc_id))
    cost_time["codename"] = round(time.time()-start_time,2)

    start_time = time.time()
    predictor.getPredictor("prem").predict(list_sentences,list_entitys)
    log("get prem done of doc_id%s"%(doc_id))
    cost_time["prem"] = round(time.time()-start_time,2)

    start_time = time.time()
    predictor.getPredictor("product").predict(list_sentences,list_entitys)
    log("get product done of doc_id%s"%(doc_id))
    cost_time["product"] = round(time.time()-start_time,2)

    start_time = time.time()
    product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
    log("get product attributes done of doc_id%s"%(doc_id))
    cost_time["product_attrs"] = round(time.time()-start_time,2)

    start_time = time.time()
    predictor.getPredictor("epc").predict(list_sentences,list_entitys)
    log("get epc done of doc_id%s"%(doc_id))
    cost_time["person"] = round(time.time()-start_time,2)

    start_time = time.time()
    predictor.getPredictor("time").predict(list_sentences, list_entitys)
    log("get time done of doc_id%s"%(doc_id))
    cost_time["time"] = round(time.time()-start_time,2)

    # Must run after getPredictor("prem") and before getAttributes.getPREMs.
    # When the title mentions exactly one of supervision/construction/design/
    # survey and is NOT a construction/EPC notice, relabel money entities whose
    # notes match that keyword.
    if len(re.findall('监理|施工|设计|勘察', title)) == 1 and re.search('施工|总承包|epc|EPC', title) is None:
        keyword = re.search('监理|设计|勘察', title).group(0)
        for list_entity in list_entitys:
            for _entity in list_entity:
                if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label == 2:
                    # 2021/11/18: reclassify the fee as tender amount or
                    # winning-bid amount depending on the announcement channel.
                    if list_channel_dic[0]['docchannel'] == "招标公告":
                        _entity.values[0] = 0.51
                        _entity.set_Money(0, _entity.values)
                    else:
                        _entity.values[1] = 0.51
                        _entity.set_Money(1, _entity.values)

    # Depends on sentence order: linking and attribute assembly assume the
    # entity lists are still aligned with the sentence lists.
    start_time = time.time()
    entityLink.link_entitys(list_entitys)
    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
    log("get attributes done of doc_id%s"%(doc_id))
    cost_time["attrs"] = round(time.time()-start_time,2)

    start_time = time.time()
    list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
    cost_time["punish"] = round(time.time()-start_time,2)

    # If a known product name occurs inside a demand item's project name,
    # add that product to the demand item's product list.
    if len(product_attrs[1]['demand_info']['data']) > 0:
        for d in product_attrs[1]['demand_info']['data']:
            for product in set(prem[0]['product']):
                if product in d['project_name']:
                    d['product'].append(product)

    # Merge all partial results into one flat dict; later keys win on clash.
    data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1])
    data_res["cost_time"] = cost_time
    data_res["success"] = True

    # Debug logging of the processed article and every extracted entity.
    for _article in list_articles:
        log(_article.content)
    for list_entity in list_entitys:
        for _entity in list_entity:
            log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
                (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
                 str(_entity.begin_index),str(_entity.end_index)))

    return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)


def test(name, content):
    """POST one document to the remote extraction service and return its reply.

    Args:
        name: document id sent as the "id" field.
        content: document content sent as the "content" field.

    Returns:
        The service response body decoded as a UTF-8 string.
    """
    user = {
        "content": content,
        "id": name
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post("http://192.168.2.102:15030" + '/article_extract', json=user, headers=myheaders, verify=True)
    resp_json = _resp.content.decode("utf-8")
    return resp_json


if __name__ == "__main__":
    # Ad-hoc local smoke test: extract one HTML file from disk and print
    # the JSON result.  NOTE(review): the path is machine-specific.
    t1 = time.time()
    title = '打印机'
    with open('D:/html/138786703.html', 'r', encoding='utf-8') as f:
        text = f.read()
    print(predict('', text, title))