# run_single_server.py
  1. # -*- coding: utf-8 -*-
  2. import sys
  3. import json
  4. import re
  5. import os
  6. sys.path.append(os.path.abspath("../.."))
  7. os.environ['KERAS_BACKEND']='tensorflow'
  8. from module.Utils import log
  9. """
  10. Created on Fri Jun 1 18:03:03 2018
  11. @author: DONG
  12. """
  13. from module import extractFlow
  14. from flask import Flask, jsonify
  15. from flask import abort
  16. from flask import request
  17. import time
  18. import uuid
  19. from module.Utils import xpath2css
  20. app = Flask(__name__)
  21. app.config['JSON_AS_ASCII'] = False
  22. def transformInterface(_dict):
  23. trans_dict = {}
  24. trans_dict["status_code"] = _dict.get("status_code",500)
  25. flag = True
  26. listpage_a = _dict.get("listpage_A")
  27. listpage_date = _dict.get("listpage_Date")
  28. if listpage_a and listpage_date:
  29. if listpage_a[0]==listpage_date[0]:
  30. ruleValue = listpage_a[0]
  31. # trans_dict["listPageNode"] = {"ruleType":"xpath",
  32. # "ruleValue":ruleValue,
  33. # "ruleKey":""}
  34. trans_dict["listPageNode"] = {"ruleType": "css",
  35. "ruleValue": xpath2css(ruleValue),
  36. "ruleKey": ""}
  37. else:
  38. flag = False
  39. else:
  40. flag = False
  41. listpage_turn_before = _dict.get("listpage_turn_before")
  42. listpage_turn_after = _dict.get("listpage_turn_after")
  43. listpage_pageStep = _dict.get("listpage_pageStep")
  44. listpage_nextPage = _dict.get("listpage_nextPage")
  45. _nextPage = False
  46. if listpage_nextPage:
  47. _nextPage = True
  48. trans_dict["needGetNextPage"] = _nextPage
  49. if listpage_turn_before is not None and listpage_turn_after is not None and listpage_pageStep is not None:
  50. if listpage_pageStep>0:
  51. paramOrder = 1
  52. else:
  53. paramOrder = 0
  54. ruleType = 0
  55. ruleValue = "%sbdPageNum%s"%(listpage_turn_before,listpage_turn_after)
  56. trans_dict["nextPageRule"] = {"ruleType":ruleType,
  57. "paramOrder":paramOrder,
  58. "ruleLink":ruleValue
  59. }
  60. trans_dict["needGetNextPage"] = True
  61. else:
  62. flag = False
  63. detail_date = _dict.get("detail_date")
  64. trans_dict["needDetailTime"] = False
  65. if detail_date:
  66. # trans_dict["detailDateNode"] = {"ruleType": "xpath",
  67. # "ruleValue": detail_date
  68. # }
  69. trans_dict["detailDateNode"] = {"ruleType": "css",
  70. "ruleValue": xpath2css(detail_date)
  71. }
  72. trans_dict["needDetailTime"] = True
  73. else:
  74. flag = False
  75. detail_title = _dict.get("detail_title")
  76. trans_dict["needDetailTitle"] = False
  77. if detail_title:
  78. # trans_dict["detailTitleNode"] = {"ruleType": "xpath",
  79. # "ruleValue": detail_title
  80. # }
  81. trans_dict["detailTitleNode"] = {"ruleType": "css",
  82. "ruleValue": xpath2css(detail_title)
  83. }
  84. trans_dict["needDetailTitle"] = True
  85. else:
  86. flag = False
  87. detail_content = _dict.get("detail_content")
  88. if detail_content:
  89. # trans_dict["detailContentNode"] = {"ruleType": "xpath",
  90. # "ruleValue": detail_content
  91. # }
  92. trans_dict["detailContentNode"] = {"ruleType": "css",
  93. "ruleValue": xpath2css(detail_content)
  94. }
  95. else:
  96. flag = False
  97. detail_removeList = _dict.get("detail_removeList")
  98. if detail_removeList:
  99. trans_dict["detailRemoveNode"] = "//".join(detail_removeList)
  100. trans_dict["flag"] = flag
  101. return trans_dict
  102. @app.route('/content_extract', methods=['POST'])
  103. def text_predict():
  104. start_time = time.time()
  105. # 初始化待返回结果
  106. data = {"listpage_url": "","status_code":201}
  107. MAX_CONTENT = 150000
  108. # 确保请求符合要求
  109. if request.method == "POST":
  110. if (not request.json) or ('listpage_url' not in request.json):
  111. abort(400)
  112. else:
  113. try:
  114. k = str(uuid.uuid4())
  115. data["id"] = str(k)
  116. listpage_url = request.json["listpage_url"]
  117. log("begin to getting rule of listpage:"+str(listpage_url))
  118. if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",listpage_url) is None:
  119. data["status_code"] = 400
  120. abort(400)
  121. else:
  122. data = extractFlow.ruleExtract(listpage_url)
  123. log("done for setting result of listpage:"+str(listpage_url))
  124. data["listpage_url"] = listpage_url
  125. except Exception as e:
  126. app.logger.info(msg="error:"+str(e),extra={"chain":""})
  127. data["error_msg"] = str(e)
  128. # 以json形式返回结果
  129. log(" time from receive to send: "+str(time.time()-start_time))
  130. data = transformInterface(data)
  131. # log(str(data))
  132. _resp = jsonify(data)
  133. #log(str(data["flag"])+str(data))
  134. return _resp, 201
  135. if __name__ == '__main__':
  136. app.run(host='192.168.2.65', port=15015, threaded=True, debug=False) #15015 2.65
  137. log("ContentExtractor running")