# -*- coding: utf-8 -*-
"""
Flask service exposing ``POST /content_extract``.

The endpoint receives a list-page URL, runs ``extractFlow.ruleExtract`` on it
and returns the extracted rules translated by ``transformInterface``.

Created on Fri Jun 1 18:03:03 2018

@author: DONG
"""
import sys
import json
import re
import os

# These two statements run before the ``module`` imports below, as in the
# original file; presumably the project imports depend on them.
sys.path.append(os.path.abspath("../.."))
os.environ['KERAS_BACKEND']='tensorflow'

from module.Utils import log
from module import extractFlow
from flask import Flask, jsonify
from flask import abort
from flask import request
import time
import uuid
from module.Utils import xpath2css

app = Flask(__name__)
app.config['JSON_AS_ASCII'] = False

# Loose "looks like an http(s) URL" check applied to the incoming listpage_url.
# Raw string: the pattern contains backslash escapes meant for the regex engine.
_URL_PATTERN = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)


def _css_rule(xpath):
    """Return a CSS rule node whose value is ``xpath`` converted by xpath2css."""
    return {"ruleType": "css", "ruleValue": xpath2css(xpath)}


def transformInterface(_dict):
    """Translate an extraction result into the response dictionary.

    Rules are emitted as CSS selectors obtained by passing the extracted
    values through ``xpath2css``.

    Args:
        _dict: mapping read with ``.get`` for the keys ``status_code``,
            ``listpage_A``, ``listpage_Date``, ``listpage_turn_before``,
            ``listpage_turn_after``, ``listpage_pageStep``,
            ``listpage_nextPage``, ``detail_date``, ``detail_title``,
            ``detail_content`` and ``detail_removeList``. ``None`` is treated
            as an empty mapping.

    Returns:
        dict with ``status_code`` (500 when absent from the input), the rule
        nodes that could be built, the ``need*`` switches, and ``flag``, which
        is True only when the list-page node, the next-page rule and the
        detail date/title/content nodes were all produced.
    """
    if _dict is None:
        _dict = {}

    trans_dict = {"status_code": _dict.get("status_code", 500)}
    flag = True

    # List-page node: only emitted when the first entries of the anchor rule
    # and the date rule are equal.
    # presumably both values are sequences whose first item is an xpath -
    # TODO confirm against extractFlow.ruleExtract
    listpage_a = _dict.get("listpage_A")
    listpage_date = _dict.get("listpage_Date")
    if listpage_a and listpage_date and listpage_a[0] == listpage_date[0]:
        list_node = _css_rule(listpage_a[0])
        list_node["ruleKey"] = ""
        trans_dict["listPageNode"] = list_node
    else:
        flag = False

    # Paging: a truthy listpage_nextPage requests next-page fetching; a
    # complete turn-page template forces it on and adds the rule itself.
    listpage_turn_before = _dict.get("listpage_turn_before")
    listpage_turn_after = _dict.get("listpage_turn_after")
    listpage_pageStep = _dict.get("listpage_pageStep")
    trans_dict["needGetNextPage"] = bool(_dict.get("listpage_nextPage"))
    if (listpage_turn_before is not None
            and listpage_turn_after is not None
            and listpage_pageStep is not None):
        trans_dict["nextPageRule"] = {
            "ruleType": 0,
            # 1 for a positive page step, 0 otherwise.
            "paramOrder": 1 if listpage_pageStep > 0 else 0,
            # "bdPageNum" is placed between the text before and after the
            # page number.
            "ruleLink": "%sbdPageNum%s" % (listpage_turn_before,
                                           listpage_turn_after),
        }
        trans_dict["needGetNextPage"] = True
    else:
        flag = False

    detail_date = _dict.get("detail_date")
    trans_dict["needDetailTime"] = False
    if detail_date:
        trans_dict["detailDateNode"] = _css_rule(detail_date)
        trans_dict["needDetailTime"] = True
    else:
        flag = False

    detail_title = _dict.get("detail_title")
    trans_dict["needDetailTitle"] = False
    if detail_title:
        trans_dict["detailTitleNode"] = _css_rule(detail_title)
        trans_dict["needDetailTitle"] = True
    else:
        flag = False

    detail_content = _dict.get("detail_content")
    if detail_content:
        trans_dict["detailContentNode"] = _css_rule(detail_content)
    else:
        flag = False

    # Optional: a missing remove list does not clear the flag.
    detail_removeList = _dict.get("detail_removeList")
    if detail_removeList:
        trans_dict["detailRemoveNode"] = "//".join(detail_removeList)

    trans_dict["flag"] = flag
    return trans_dict


@app.route('/content_extract', methods=['POST'])
def text_predict():
    """Handle ``POST /content_extract``.

    Expects a JSON body containing ``listpage_url``. A body that is missing,
    empty or lacks that key is rejected with HTTP 400. Otherwise the URL is
    passed to ``extractFlow.ruleExtract`` and the outcome is converted by
    ``transformInterface``.

    Returns:
        A ``(response, 201)`` tuple whose JSON body is the dictionary built
        by ``transformInterface``. Failures during extraction are reported
        through that body (``flag`` is false), not through the HTTP status.
    """
    start_time = time.time()
    # Result used when extraction does not complete.
    data = {"listpage_url": "", "status_code": 201}

    # The route is registered for POST only, so only the payload is checked.
    if (not request.json) or ('listpage_url' not in request.json):
        abort(400)

    try:
        data["id"] = str(uuid.uuid4())
        listpage_url = request.json["listpage_url"]
        log("begin to getting rule of listpage:"+str(listpage_url))
        if _URL_PATTERN.search(listpage_url) is None:
            data["status_code"] = 400
            # NOTE(review): this abort is caught by the ``except`` below, so
            # the client receives HTTP 201 with status_code 400 in the body.
            # Kept as-is for compatibility; confirm whether a real HTTP 400
            # is wanted.
            abort(400)
        result = extractFlow.ruleExtract(listpage_url)
        # Keep ``data`` a dict: rebinding it to None (or another type) would
        # make both the assignment below and the error handler fail.
        if not isinstance(result, dict):
            raise TypeError("ruleExtract returned %s, expected dict"
                            % type(result).__name__)
        data = result
        log("done for setting result of listpage:"+str(listpage_url))
        data["listpage_url"] = listpage_url
    except Exception as e:
        app.logger.info(msg="error:"+str(e),extra={"chain":""})
        # NOTE(review): transformInterface does not copy error_msg (or id),
        # so neither reaches the client.
        data["error_msg"] = str(e)

    # Return the result as JSON.
    log(" time from receive to send: "+str(time.time()-start_time))
    data = transformInterface(data)
    return jsonify(data), 201


if __name__ == '__main__':
    # Logged before app.run() because app.run() blocks until the server stops.
    log("ContentExtractor running")
    app.run(host='192.168.2.65', port=15015, threaded=True, debug=False)  #15015  2.65