123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 |
- # -*- coding: utf-8 -*-
- import sys
- import json
- import re
- import os
- sys.path.append(os.path.abspath("../.."))
- os.environ['KERAS_BACKEND']='tensorflow'
- from module.Utils import log
- """
- Created on Fri Jun 1 18:03:03 2018
- @author: DONG
- """
- from module import extractFlow
- from flask import Flask, jsonify
- from flask import abort
- from flask import request
- import time
- import uuid
- from module.Utils import xpath2css
# Flask application object; the route handler below is registered on it.
app = Flask(__name__)
# Ask Flask's JSON serialisation to keep non-ASCII characters as-is instead of
# escaping them (applies to Flask versions that honour this config key).
app.config['JSON_AS_ASCII'] = False
def transformInterface(_dict):
    """Map a raw extraction result onto the rule structure sent to clients.

    Reads the extractor's keys from ``_dict`` (``listpage_A``,
    ``listpage_Date``, ``listpage_turn_before``, ``listpage_turn_after``,
    ``listpage_pageStep``, ``listpage_nextPage``, ``detail_date``,
    ``detail_title``, ``detail_content``, ``detail_removeList``) and builds a
    new dict with:

    * ``status_code`` -- copied from the input, 500 when absent;
    * ``listPageNode`` -- only when the first entries of ``listpage_A`` and
      ``listpage_Date`` are equal;
    * ``needGetNextPage`` / ``nextPageRule`` -- paging information;
    * ``needDetailTime`` / ``detailDateNode``, ``needDetailTitle`` /
      ``detailTitleNode``, ``detailContentNode`` -- detail-page selectors;
    * ``detailRemoveNode`` -- the remove list joined with ``"//"``;
    * ``flag`` -- True only when the list node, the paging rule and all three
      detail selectors were produced.

    Selector values are passed through ``xpath2css`` and emitted with
    ``ruleType`` ``"css"``.

    Args:
        _dict: mapping produced by the extraction flow (or the fallback dict
            built by the request handler).

    Returns:
        dict: the transformed rule dictionary described above.
    """
    result = {"status_code": _dict.get("status_code", 500)}
    # One boolean per mandatory section; "flag" is True only if all of them are.
    section_ok = []

    # --- list page node -------------------------------------------------
    anchors = _dict.get("listpage_A")
    dates = _dict.get("listpage_Date")
    same_node = bool(anchors and dates) and anchors[0] == dates[0]
    if same_node:
        result["listPageNode"] = {
            "ruleType": "css",
            "ruleValue": xpath2css(anchors[0]),
            "ruleKey": "",
        }
    section_ok.append(bool(same_node))

    # --- paging -----------------------------------------------------------
    url_prefix = _dict.get("listpage_turn_before")
    url_suffix = _dict.get("listpage_turn_after")
    page_step = _dict.get("listpage_pageStep")
    result["needGetNextPage"] = bool(_dict.get("listpage_nextPage"))

    has_turn_rule = all(
        part is not None for part in (url_prefix, url_suffix, page_step)
    )
    if has_turn_rule:
        result["nextPageRule"] = {
            "ruleType": 0,
            # 1 for a positive page step, 0 otherwise.
            "paramOrder": 1 if page_step > 0 else 0,
            "ruleLink": "%sbdPageNum%s" % (url_prefix, url_suffix),
        }
        # A URL-based turn rule implies paging even without a next-page node.
        result["needGetNextPage"] = True
    section_ok.append(has_turn_rule)

    # --- detail page selectors ---------------------------------------------
    # (input key, output node key, companion "need" switch or None)
    detail_specs = (
        ("detail_date", "detailDateNode", "needDetailTime"),
        ("detail_title", "detailTitleNode", "needDetailTitle"),
        ("detail_content", "detailContentNode", None),
    )
    for source_key, node_key, need_key in detail_specs:
        xpath_expr = _dict.get(source_key)
        if need_key is not None:
            result[need_key] = False
        if xpath_expr:
            result[node_key] = {
                "ruleType": "css",
                "ruleValue": xpath2css(xpath_expr),
            }
            if need_key is not None:
                result[need_key] = True
        section_ok.append(bool(xpath_expr))

    # --- optional remove list (does not influence the flag) -----------------
    remove_nodes = _dict.get("detail_removeList")
    if remove_nodes:
        result["detailRemoveNode"] = "//".join(remove_nodes)

    result["flag"] = all(section_ok)
    return result
# Loose check that the submitted string contains an http(s) URL.
# Written as a raw string: the pattern contains "\(" and "\)", which are
# invalid escape sequences in an ordinary string literal.
_LISTPAGE_URL_PATTERN = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)


@app.route('/content_extract', methods=['POST'])
def text_predict():
    """Extract crawling rules for the list page given in the request body.

    Expects a JSON body containing ``listpage_url``. The URL is checked
    against a loose http(s) pattern, handed to ``extractFlow.ruleExtract`` and
    the outcome is converted with ``transformInterface`` before being
    returned as JSON.

    Returns:
        tuple: ``(response, 201)``. Whenever this function returns normally
        the HTTP status is 201; the outcome of the extraction is carried in
        the body (``status_code`` and ``flag``).

    Raises:
        werkzeug HTTPException (400): when the body is not JSON or lacks
        ``listpage_url``.
    """
    start_time = time.time()
    # Initialise the result that will be returned.
    data = {"listpage_url": "", "status_code": 201}
    # Make sure the request meets the requirements.
    if request.method == "POST":
        if (not request.json) or ('listpage_url' not in request.json):
            abort(400)
        else:
            try:
                data["id"] = str(uuid.uuid4())
                listpage_url = request.json["listpage_url"]
                log("begin to getting rule of listpage:"+str(listpage_url))
                if _LISTPAGE_URL_PATTERN.search(listpage_url) is None:
                    data["status_code"] = 400
                    # NOTE(review): abort() raises an HTTPException, which the
                    # `except Exception` below catches, so the client still
                    # receives the normal 201 JSON response whose body carries
                    # status_code 400. Kept as-is because callers may rely on
                    # it -- confirm whether a real HTTP 400 is wanted here.
                    abort(400)
                else:
                    extracted = extractFlow.ruleExtract(listpage_url)
                    log("done for setting result of listpage:"+str(listpage_url))
                    extracted["listpage_url"] = listpage_url
                    # Rebind only once the result proved usable, so that the
                    # handler below always has a dict to annotate even if the
                    # extractor returned None or a non-dict value.
                    data = extracted
            except Exception as e:
                app.logger.info(msg="error:"+str(e),extra={"chain":""})
                data["error_msg"] = str(e)

    # Return the result as JSON.
    log(" time from receive to send: "+str(time.time()-start_time))
    data = transformInterface(data)
    _resp = jsonify(data)
    return _resp, 201
if __name__ == '__main__':
    # Log before starting the server: app.run() blocks until the server
    # stops, so a message placed after it would only be emitted at shutdown.
    log("ContentExtractor running")
    app.run(host='192.168.2.65', port=15015, threaded=True, debug=False)  # 15015 2.65
|