# 3.py

'''
Created on 2019-01-03
@author: User
'''
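# Ad-hoc test client for the article_extract HTTP service: a single-request
# smoke test in __main__ plus a multi-threaded load test (test_highConcurrency).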
import sys
import os
import json
import re
import pickle
import requests
import codecs
from bs4 import BeautifulSoup
import time
import shutil
from threading import Thread
import jpype

sys.path.append(os.path.abspath("../.."))


def save(object_to_save, path):
    '''
    Save an object with pickle.
    @Args:
        object_to_save: the object to save
    @Return:
        the path the object was saved to
    '''
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)


def load(path):
    '''
    Load a pickled object.
    @Args:
        path: the path to read from
    @Return:
        the loaded object
    '''
    with open(path, 'rb') as f:
        object1 = pickle.load(f)
        return object1


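# POST one document to the article_extract endpoint; return the raw JSON body
# when the service answers 201, otherwise return None.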
def test(name, content):
    user = {
        # "content": "XXXXXXXXXXXXXXXXXXX",
        "content": content,
        "id": name,
        "doc_id": "1234"
    }
    myheaders = {
        'Content-Type': 'application/json',
        "Authorization": "NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg==",
        "appKey": "203780894",
        "appSecret": "3rwyr0b8djsn6l3o4i8mplxe4giiy2ke"
    }
    try:
        # _resp = requests.post('http://pai-eas-vpc.cn-hangzhou.aliyuncs.com/api/predict/content_extract', json=user, headers=myheaders, verify=True)
        _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
        # _resp = requests.post("http://192.168.2.101:15030" + '/article_extract', json=user, headers=myheaders, verify=True)
        # _resp = requests.post("http://127.0.0.1:15013" + '/content_extract', json=user, headers=myheaders, verify=True)
        # print("http://127.0.0.1:15014")
        resp_json = _resp.content.decode("utf-8")
        print("##", _resp.status_code)
        print("==", resp_json)
        print(json.loads(resp_json))
        if _resp.status_code == 201:
            print(json.loads(resp_json))
            return resp_json
        else:
            print(resp_json)
            return None
    except Exception as e:
        print(str(e))
        return None


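# Copy one source file from the local data directory into the getfile folder.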
def getFile(filename):
    path = "C:\\Users\\User\\Desktop\\数据20191014\\"
    file = path + filename
    dest_dir = "C:\\Users\\User\\Desktop\\getfile"
    shutil.copy(file, dest_dir)


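# Worker thread for the load test: replays up to 100 saved documents against
# the service and accumulates the per-stage cost_time reported in each response.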
class MyThread(Thread):
    # Class-level attributes, shared by every MyThread instance.
    cost_time = dict()
    list_result = []
    num_200 = []
    num_other = []
    data = load("list_contents.pk")

    def run(self):
        for item in self.data[:100]:
            filename = item[0]
            content = item[1]
            result = test(filename, content)
            if result is not None:
                self.num_200.append("")
                self.list_result.append(result)
                result = json.loads(result)
                _time = result["cost_time"]
                for _key in _time.keys():
                    if _key not in self.cost_time:
                        self.cost_time[_key] = 0
                    self.cost_time[_key] += _time[_key]
            else:
                self.num_other.append("")


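# Start thread_num worker threads, wait for them all, then report the total
# wall time, the success/failure counts and the accumulated cost_time.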
def test_highConcurrency():
    thread_num = 10
    list_thread = []
    cost_time = dict()
    _start_time = time.time()
    for i in range(thread_num):
        t = MyThread()
        list_thread.append(t)
    for t in list_thread:
        t.start()
    for t in list_thread:
        t.join()
    # cost_time, num_200 and num_other are class attributes of MyThread, so any
    # single instance sees the totals accumulated by all threads.
    t = list_thread[0]
    _time = t.cost_time
    for _key in _time.keys():
        if _key not in cost_time:
            cost_time[_key] = 0
        cost_time[_key] += _time[_key]
    num_200 = len(t.num_200)
    num_other = len(t.num_other)
    print("==================")
    print("cost:", time.time() - _start_time)
    print("num_200:", num_200, "num_other:", num_other)
    print(cost_time)


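# Single-request smoke test: extract the <div id="pcontent"> block from a local
# HTML file and post it to the service.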
if __name__ == "__main__":
    text = codecs.open("C:\\Users\\User\\Desktop\\2.html", "r", encoding="utf8").read()
    start_time = time.time()
    content = str(BeautifulSoup(text, 'lxml').find("div", id="pcontent"))
    print(content)
    test("12", content)
    print("takes %d" % (time.time() - start_time))
    '''
    data = load("list_contents.pk")
    for item in data[:100]:
        filename = item[0]
        content = item[1]
        a = time.time()
        # predict("12", content)
        test("12", content)
        print("takes", time.time() - a)
        break
    test_highConcurrency()
    '''