run_single_server.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. # -*- coding: utf-8 -*-
  2. import sys
  3. import json
  4. import re
  5. import os
  6. os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
  7. sys.path.append(os.path.abspath("../.."))
  8. os.environ['KERAS_BACKEND']='tensorflow'
  9. from module.Utils import log
  10. """
  11. Created on Fri Jun 1 18:03:03 2018
  12. @author: DONG
  13. """
  14. from module import extractFlow
  15. from flask import Flask, jsonify
  16. from flask import abort
  17. from flask import request
  18. import time
  19. import uuid
  20. from module.Utils import xpath2css
  21. app = Flask(__name__)
  22. app.config['JSON_AS_ASCII'] = False
  23. def transformInterface(_dict):
  24. trans_dict = {}
  25. trans_dict["status_code"] = _dict.get("status_code",500)
  26. flag = True
  27. listpage_a = _dict.get("listpage_A")
  28. listpage_date = _dict.get("listpage_Date")
  29. if listpage_a and listpage_date:
  30. if listpage_a[0]==listpage_date[0]:
  31. ruleValue = listpage_a[0]
  32. # trans_dict["listPageNode"] = {"ruleType":"xpath",
  33. # "ruleValue":ruleValue,
  34. # "ruleKey":""}
  35. trans_dict["listPageNode"] = {"ruleType": "css",
  36. "ruleValue": xpath2css(ruleValue),
  37. "ruleKey": ""}
  38. else:
  39. flag = False
  40. else:
  41. flag = False
  42. listpage_turn_before = _dict.get("listpage_turn_before")
  43. listpage_turn_after = _dict.get("listpage_turn_after")
  44. listpage_pageStep = _dict.get("listpage_pageStep")
  45. listpage_nextPage = _dict.get("listpage_nextPage")
  46. _nextPage = False
  47. if listpage_nextPage:
  48. _nextPage = True
  49. ruleType = 1
  50. ruleValue = xpath2css(listpage_nextPage[0]) if listpage_nextPage[1]=='xpath' else listpage_nextPage[0]
  51. trans_dict["nextPageRule"] = {"ruleType": ruleType,
  52. "paramOrder":1,
  53. "ruleValue":ruleValue}
  54. trans_dict["needGetNextPage"] = _nextPage
  55. elif listpage_turn_before is not None and listpage_turn_after is not None and listpage_pageStep is not None:
  56. if listpage_pageStep>0:
  57. paramOrder = 1
  58. else:
  59. paramOrder = 0
  60. ruleType = 0
  61. ruleValue = "%sbdPageNum%s"%(listpage_turn_before,listpage_turn_after)
  62. trans_dict["nextPageRule"] = {"ruleType":ruleType,
  63. "paramOrder":paramOrder,
  64. "ruleLink":ruleValue
  65. }
  66. trans_dict["needGetNextPage"] = True
  67. else:
  68. flag = False
  69. trans_dict["needGetNextPage"] = _nextPage
  70. detail_date = _dict.get("detail_date")
  71. trans_dict["needDetailTime"] = False
  72. if detail_date:
  73. # trans_dict["detailDateNode"] = {"ruleType": "xpath",
  74. # "ruleValue": detail_date
  75. # }
  76. trans_dict["detailDateNode"] = {"ruleType": "css",
  77. "ruleValue": xpath2css(detail_date)
  78. }
  79. trans_dict["needDetailTime"] = True
  80. else:
  81. flag = False
  82. detail_title = _dict.get("detail_title")
  83. trans_dict["needDetailTitle"] = False
  84. if detail_title:
  85. # trans_dict["detailTitleNode"] = {"ruleType": "xpath",
  86. # "ruleValue": detail_title
  87. # }
  88. trans_dict["detailTitleNode"] = {"ruleType": "css",
  89. "ruleValue": xpath2css(detail_title)
  90. }
  91. trans_dict["needDetailTitle"] = True
  92. else:
  93. flag = False
  94. detail_content = _dict.get("detail_content")
  95. if detail_content:
  96. # trans_dict["detailContentNode"] = {"ruleType": "xpath",
  97. # "ruleValue": detail_content
  98. # }
  99. trans_dict["detailContentNode"] = {"ruleType": "css",
  100. "ruleValue": xpath2css(detail_content)
  101. }
  102. else:
  103. flag = False
  104. detail_removeList = _dict.get("detail_removeList")
  105. if detail_removeList:
  106. trans_dict["detailRemoveNode"] = "//".join(detail_removeList)
  107. trans_dict["flag"] = flag
  108. return trans_dict
  109. @app.route('/content_extract', methods=['POST'])
  110. def text_predict():
  111. start_time = time.time()
  112. # 初始化待返回结果
  113. data = {"listpage_url": "","status_code":201}
  114. MAX_CONTENT = 150000
  115. # 确保请求符合要求
  116. if request.method == "POST":
  117. if (not request.json) or ('listpage_url' not in request.json):
  118. abort(400)
  119. else:
  120. try:
  121. k = str(uuid.uuid4())
  122. data["id"] = str(k)
  123. listpage_url = request.json["listpage_url"]
  124. log("begin to getting rule of listpage:"+str(listpage_url))
  125. if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",listpage_url) is None:
  126. data["status_code"] = 400
  127. abort(400)
  128. else:
  129. data = extractFlow.ruleExtract(listpage_url)
  130. log("done for setting result of listpage:"+str(listpage_url))
  131. data["listpage_url"] = listpage_url
  132. except Exception as e:
  133. app.logger.info(msg="error:"+str(e),extra={"chain":""})
  134. data["error_msg"] = str(e)
  135. # 以json形式返回结果
  136. log(" time from receive to send: "+str(time.time()-start_time))
  137. # print('返回结果: ',data)
  138. data = transformInterface(data)
  139. # log(str(data))
  140. _resp = jsonify(data)
  141. #log(str(data["flag"])+str(data))
  142. return _resp, 201
  143. if __name__ == '__main__':
  144. app.run(host='192.168.2.102', port=15015, threaded=True, debug=False) #15015 2.65
  145. log("ContentExtractor running")