'''
Created on 2019-01-04

@author: User
'''
from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import os
import codecs
import requests
import time

_time1 = time.time()
sys.path.append(os.path.abspath("../.."))
from BiddingKG.dl.common.Utils import *
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.complaint.punish_predictor as punish_rule
import json
import numpy as np  # explicit import for the numpy checks in MyEncoder


# Custom JSON encoder: converts numpy and bytes values into JSON-serializable types
class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)


def predict(doc_id, text, title=""):
    '''Run the full extraction pipeline on one document and return the result as a JSON string.'''
    cost_time = dict()

    # Preprocessing: split the document into articles, sentences and entities
    start_time = time.time()
    log("start process doc %s" % (str(doc_id)))
    list_articles, list_sentences, list_entitys, _cost_time = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", title]], useselffool=True)
    log("get preprocessed done of doc_id%s" % (doc_id))
    cost_time["preprocess"] = time.time() - start_time
    cost_time.update(_cost_time)

    # Project code / name extraction
    start_time = time.time()
    codeName = predictor.getPredictor("codeName").predict(list_sentences, MAX_AREA=5000, list_entitys=list_entitys)
    log("get codename done of doc_id%s" % (doc_id))
    cost_time["codename"] = time.time() - start_time

    # Role and money (prem) classification
    start_time = time.time()
    predictor.getPredictor("prem").predict(list_sentences, list_entitys)
    log("get prem done of doc_id%s" % (doc_id))
    cost_time["prem"] = time.time() - start_time

    # Product extraction
    start_time = time.time()
    predictor.getPredictor("product").predict(list_sentences, list_entitys)
    log("get product done of doc_id%s" % (doc_id))
    cost_time["product"] = time.time() - start_time

    # Rule-based role correction
    start_time = time.time()
    predictor.getPredictor("roleRule").predict(list_articles, list_sentences, list_entitys, codeName)
    cost_time["rule"] = time.time() - start_time

    # Contact person (epc) extraction
    start_time = time.time()
    predictor.getPredictor("epc").predict(list_sentences, list_entitys)
    log("get epc done of doc_id%s" % (doc_id))
    cost_time["person"] = time.time() - start_time

    # Time entity extraction
    start_time = time.time()
    predictor.getPredictor("time").predict(list_sentences, list_entitys)
    log("get time done of doc_id%s" % (doc_id))
    cost_time["time"] = time.time() - start_time

    # Entity linking and attribute aggregation
    start_time = time.time()
    entityLink.link_entitys(list_entitys)
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    log("get attributes done of doc_id%s" % (doc_id))
    cost_time["attrs"] = time.time() - start_time

    # Punishment information extraction
    start_time = time.time()
    list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles, list_sentences, list_entitys)
    cost_time["punish"] = time.time() - start_time

    # Document channel classification
    start_time = time.time()
    list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
    cost_time["channel"] = time.time() - start_time

    # print(prem)
    # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic)[0]
    data_res = Preprocessing.union_result(
        Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),
        list_channel_dic)[0]
    data_res["cost_time"] = cost_time
    data_res["success"] = True

    for _article in list_articles:
        log(_article.content)

    for list_entity in list_entitys:
        for _entity in list_entity:
            log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s" %
                (str(_entity.entity_type), str(_entity.entity_text), str(_entity.label), str(_entity.values),
                 str(_entity.sentence_index), str(_entity.begin_index), str(_entity.end_index)))

    return json.dumps(data_res, cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)


def test(name, content):
    '''Send one document to the remote extraction service and return the raw JSON response.'''
    user = {
        "content": content,
        "id": name
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
    resp_json = _resp.content.decode("utf-8")
    print(resp_json)
    return resp_json


if __name__ == "__main__":
    import pandas as pd
    df = pd.read_excel(r'G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0812.xlsx')
    # for i in range(50):
    i = 246
    doc_id = df.loc[i, 'docid']
    text = df.loc[i, 'dochtmlcon']
    title = df.loc[i, 'doctitle']
    rs = predict(doc_id, text, title)
    print(rs)
    pass
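

# Hedged usage sketch (not part of the original script): one way to call predict()
# directly on a raw HTML string and read back the JSON it returns, without the Excel
# file used in __main__ above. The doc_id, text and title values are hypothetical
# placeholders; the "success" and "cost_time" keys are the ones set on data_res in
# predict() above.
#
# rs = predict("12345", "<div>Example announcement content...</div>", title="Example title")
# data = json.loads(rs)
# print(data["success"])
# print(data["cost_time"])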