# -*- coding: utf-8 -*-
"""
Created on Fri Jun 1 18:03:03 2018

@author: DONG
"""
- import sys
- import os
- import json
- import re
- sys.path.append(os.path.abspath("../.."))
- from flask import Flask, jsonify
- from flask import abort
- from flask import request
- import time
- import uuid
- from BiddingKG.dl.common.Utils import log
- from bs4 import BeautifulSoup, Comment
- import BiddingKG.dl.interface.predictor as predictor
- import BiddingKG.dl.interface.Preprocessing as Preprocessing
- import BiddingKG.dl.interface.getAttributes as getAttributes
- import BiddingKG.dl.entityLink.entityLink as entityLink
- import json
- import traceback
# Flask application object that the route below is registered on.
app = Flask(__name__)
# JSON_AS_ASCII is a Flask configuration key; disabling it is meant to keep
# non-ASCII text unescaped in JSON responses (see the Flask config docs).
app.config.update(JSON_AS_ASCII=False)

# The predictors are constructed once at import time and reused by every
# request handled by text_predict().
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
@app.route('/content_extract', methods=['POST'])
def text_predict():
    """Run the extraction pipeline on one posted document and return JSON.

    The request body must be a JSON object containing a ``content`` field.
    ``doc_id`` and ``title`` are optional and default to ``""``.

    Returns:
        A ``(response, 201)`` tuple. On success the JSON body is the first
        result of ``Preprocessing.union_result(codeName, prem)`` extended
        with ``cost_time`` (seconds spent per stage) and ``success: True``.
        If any pipeline stage raises, the traceback is printed to stdout and
        the body is the fallback payload with ``success: False``; the status
        code is still 201.

    Raises:
        werkzeug HTTPException (400): via ``abort`` when the body is missing,
        empty, not a JSON object, or has no ``content`` key.
    """
    # Parse the body once instead of re-reading request.json on every access.
    payload = request.json

    # Make sure the request is well-formed. Non-object JSON (e.g. a list)
    # is rejected here rather than failing later inside the pipeline.
    if (not payload) or (not isinstance(payload, dict)) or ('content' not in payload):
        abort(400)

    _doc_id = payload.get("doc_id", "")
    _title = payload.get("title", "")

    # Fallback result returned when the pipeline fails.
    # NOTE(review): these keys do not match the success payload; they are
    # kept unchanged so existing callers see the same failure body - confirm
    # whether {"code": [], "name": "", "prem": []} was intended instead.
    data = {"listpage_url": "", "status_code": 201}

    try:
        log("get request of doc_id:%s" % (_doc_id))
        k = str(uuid.uuid4())
        cost_time = dict()
        content = payload['content']

        start_time = time.time()
        list_articles, list_sentences, list_entitys, _cost_time = Preprocessing.get_preprocessed(
            [[k, content, "", _doc_id, _title]], useselffool=True)
        cost_time["preprocess"] = time.time() - start_time
        # Merge the timings reported by the preprocessing step itself.
        cost_time.update(_cost_time)

        start_time = time.time()
        codeName = codeNamePredict.predict(list_sentences, list_entitys)
        cost_time["codename"] = time.time() - start_time

        start_time = time.time()
        premPredict.predict(list_sentences, list_entitys)
        cost_time["prem"] = time.time() - start_time

        start_time = time.time()
        roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
        cost_time["rule"] = time.time() - start_time

        start_time = time.time()
        epcPredict.predict(list_sentences, list_entitys)
        cost_time["person"] = time.time() - start_time

        # The "attrs" timing deliberately spans both entity linking and
        # attribute extraction, as the timer is not reset in between.
        start_time = time.time()
        entityLink.link_entitys(list_entitys)
        prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
        cost_time["attrs"] = time.time() - start_time

        data = Preprocessing.union_result(codeName, prem)[0][1]
        data["cost_time"] = cost_time
        data["success"] = True
    except Exception:
        # Top-level boundary: report the failure and still answer the caller.
        traceback.print_exc(file=sys.stdout)
        data["success"] = False

    # Return the result as JSON.
    _resp = jsonify(data)
    log("done for doc_id:%s with result:%s" % (_doc_id, str(data)))
    return _resp, 201
if __name__ == '__main__':
    # app.run() blocks until the server stops, so the start-up message has
    # to be logged before it; logged afterwards it would only appear at
    # shutdown.
    log("ContentExtractor running")
    app.run(host='0.0.0.0', port=15013, threaded=True, debug=False)
|