# convert_need_interface.py
  1. import base64
  2. import inspect
  3. import json
  4. import logging
  5. import os
  6. import random
  7. import sys
  8. import time
  9. from werkzeug.exceptions import NotFound
  10. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  11. import traceback
  12. import requests
  13. from format_convert import _global
  14. from format_convert.utils import get_platform, get_sequential_data, judge_error_code, request_post, get_ip_port, \
  15. get_intranet_ip, get_logger, log, memory_decorator
  16. from ocr.ocr_interface import ocr, OcrModels
  17. from otr.otr_interface import otr, OtrModels
  18. from format_convert.libreoffice_interface import office_convert
  19. # 远程GPU接口
  20. # # interface_ip_list = ['http://192.168.2.102', 'http://192.168.2.103']
  21. # # interface_ip_list = ['http://172.16.160.65', 'http://172.16.160.64', 'http://172.16.160.66', 'http://172.16.160.67']
  22. # interface_ip_list = ['http://172.16.160.65', 'http://172.16.160.65']
  23. # # ocr_port_list = ["15011", "15013", "15015"]
  24. # # ocr_port_list = ["15011", "15013", "15015", "15017", "15019"]
  25. # # otr_port_list = ["15012", "15014", "15016", "15018", "15020"]
  26. # ocr_port_list = ["15011", "15013", "15015"]
  27. # otr_port_list = ["15012", "15014", "15016"]
  28. # # ocr_port_list = ["15011", "15013", "15015", "15017", "15019", "15021"]
  29. # # otr_port_list = ["15012", "15014", "15016", "15018", "15020", "15022"]
  30. # soffice_port_list = ["16000", "16001", "16002", "16003", "16004", "16005",
  31. # "16006", "16007", "16008", "16009"]
  32. # # ocr_port_list = ["15011", "15013"]
  33. # # otr_port_list = ["15012"]
  34. if get_platform() == "Windows":
  35. FROM_REMOTE = False
  36. else:
  37. FROM_REMOTE = True
  38. # _global = {}
  39. # ip_port_flag = {}
  40. # ip_port_dict = get_ip_port()
  41. # for _k in ip_port_dict.keys():
  42. # ip_port_flag.update({_k: {"ocr": 0,
  43. # "otr": 0,
  44. # "convert": 0,
  45. # "office": 0
  46. # }})
  47. # _global.update({"ip_port_flag": ip_port_flag})
  48. # _global.update({"ip_port": ip_port_dict})
  49. def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
  50. try:
  51. # Win10跳出超时装饰器
  52. # if get_platform() == "Windows":
  53. # # origin_office_convert = office_convert.__wrapped__
  54. # # file_path = origin_office_convert(src_path, dest_path, target_format, retry_times)
  55. # file_path = office_convert(src_path, dest_path, target_format, retry_times)
  56. # else:
  57. # # 将装饰器包装为一个类,否则多进程Pickle会报错 it's not the same object as xxx 问题,
  58. # # timeout_decorator_obj = my_timeout_decorator.TimeoutClass(office_convert, 180, TimeoutError)
  59. # # file_path = timeout_decorator_obj.run(src_path, dest_path, target_format, retry_times)
  60. #
  61. # file_path = office_convert(src_path, dest_path, target_format, retry_times)
  62. if from_remote:
  63. # 重试
  64. retry_times_1 = 1
  65. retry_times_2 = 2
  66. while retry_times_1 and retry_times_2:
  67. # _ip = ip_pool("soffice", _random=True)
  68. # _port = port_pool("soffice", _random=True)
  69. # _ip = interface_ip_list[0]
  70. # _port = "16002"
  71. # _ip, _port = interface_pool("soffice")
  72. # ip_port = from_schedule_interface("office")
  73. ip_port = interface_pool("office")
  74. if judge_error_code(ip_port):
  75. return ip_port
  76. _url = ip_port + "/soffice"
  77. with open(src_path, "rb") as f:
  78. file_bytes = f.read()
  79. base64_stream = base64.b64encode(file_bytes)
  80. start_time = time.time()
  81. r = json.loads(request_post(_url, {"src_path": src_path,
  82. "dest_path": dest_path,
  83. "file": base64_stream,
  84. "target_format": target_format,
  85. "retry_times": retry_times}, time_out=25))
  86. log("office use time " + str(time.time()-start_time))
  87. if type(r) == list:
  88. # 接口连不上换个端口重试
  89. if retry_times_1 <= 1:
  90. return r
  91. else:
  92. retry_times_1 -= 1
  93. log("retry post office_interface... left times " + str(retry_times_1))
  94. continue
  95. file_str = r.get("data")
  96. if judge_error_code(file_str):
  97. if retry_times_2 <= 1:
  98. return file_str
  99. else:
  100. retry_times_2 -= 1
  101. continue
  102. file_bytes = eval(file_str)
  103. uid1 = src_path.split(os.sep)[-1].split(".")[0]
  104. file_path = dest_path + uid1 + "." + target_format
  105. if not os.path.exists(os.path.dirname(file_path)):
  106. os.makedirs(os.path.dirname(file_path), mode=0o777)
  107. with open(file_path, "wb") as f:
  108. f.write(file_bytes)
  109. break
  110. else:
  111. file_path = office_convert(src_path, dest_path, target_format, retry_times)
  112. if judge_error_code(file_path):
  113. return file_path
  114. return file_path
  115. except TimeoutError:
  116. log("from_office_interface timeout error!")
  117. return [-5]
  118. except:
  119. log("from_office_interface error!")
  120. print("from_office_interface", traceback.print_exc())
  121. return [-1]
  122. def from_ocr_interface(image_stream, is_table=False, from_remote=FROM_REMOTE):
  123. log("into from_ocr_interface")
  124. try:
  125. base64_stream = base64.b64encode(image_stream)
  126. # 调用接口
  127. try:
  128. if from_remote:
  129. retry_times_1 = 3
  130. # 重试
  131. while retry_times_1:
  132. # _ip = ip_pool("ocr", _random=True)
  133. # _port = port_pool("ocr", _random=True)
  134. # if _ip == interface_ip_list[1]:
  135. # _port = ocr_port_list[0]
  136. # _ip, _port = interface_pool("ocr")
  137. # ip_port = _ip + ":" + _port
  138. # ip_port = from_schedule_interface("ocr")
  139. ip_port = interface_pool("ocr")
  140. if judge_error_code(ip_port):
  141. return ip_port
  142. _url = ip_port + "/ocr"
  143. r = json.loads(request_post(_url, {"data": base64_stream}, time_out=60))
  144. if type(r) == list:
  145. # 接口连不上换个端口重试
  146. if retry_times_1 <= 1:
  147. if is_table:
  148. return r, r
  149. else:
  150. return r
  151. else:
  152. retry_times_1 -= 1
  153. log("retry post ocr_interface... left times " + str(retry_times_1))
  154. continue
  155. if judge_error_code(r):
  156. return r
  157. break
  158. else:
  159. if globals().get("global_ocr_model") is None:
  160. globals().update({"global_ocr_model": OcrModels().get_model()})
  161. print("=========== init ocr model ===========")
  162. r = ocr(data=base64_stream, ocr_model=globals().get("global_ocr_model"))
  163. except TimeoutError:
  164. if is_table:
  165. return [-5], [-5]
  166. else:
  167. return [-5]
  168. except requests.exceptions.ConnectionError as e:
  169. if is_table:
  170. return [-2], [-2]
  171. else:
  172. return [-2]
  173. _dict = r
  174. text_list = eval(_dict.get("text"))
  175. bbox_list = eval(_dict.get("bbox"))
  176. if text_list is None:
  177. text_list = []
  178. if bbox_list is None:
  179. bbox_list = []
  180. if is_table:
  181. return text_list, bbox_list
  182. else:
  183. if text_list and bbox_list:
  184. text = get_sequential_data(text_list, bbox_list, html=True)
  185. if judge_error_code(text):
  186. return text
  187. else:
  188. text = ""
  189. return text
  190. except Exception as e:
  191. log("from_ocr_interface error!")
  192. # print("from_ocr_interface", e, global_type)
  193. if is_table:
  194. return [-1], [-1]
  195. else:
  196. return [-1]
  197. def from_otr_interface2(image_stream):
  198. log("into from_otr_interface")
  199. try:
  200. base64_stream = base64.b64encode(image_stream)
  201. # 调用接口
  202. try:
  203. if globals().get("global_otr_model") is None:
  204. globals().update({"global_otr_model": OtrModels().get_model()})
  205. print("=========== init otr model ===========")
  206. r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"))
  207. except TimeoutError:
  208. return [-5], [-5], [-5], [-5], [-5]
  209. except requests.exceptions.ConnectionError as e:
  210. log("from_otr_interface")
  211. print("from_otr_interface", traceback.print_exc())
  212. return [-2], [-2], [-2], [-2], [-2]
  213. # 处理结果
  214. _dict = r
  215. points = eval(_dict.get("points"))
  216. split_lines = eval(_dict.get("split_lines"))
  217. bboxes = eval(_dict.get("bboxes"))
  218. outline_points = eval(_dict.get("outline_points"))
  219. lines = eval(_dict.get("lines"))
  220. # print("from_otr_interface len(bboxes)", len(bboxes))
  221. if points is None:
  222. points = []
  223. if split_lines is None:
  224. split_lines = []
  225. if bboxes is None:
  226. bboxes = []
  227. if outline_points is None:
  228. outline_points = []
  229. if lines is None:
  230. lines = []
  231. return points, split_lines, bboxes, outline_points, lines
  232. except Exception as e:
  233. log("from_otr_interface error!")
  234. print("from_otr_interface", traceback.print_exc())
  235. return [-1], [-1], [-1], [-1], [-1]
  236. def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE):
  237. log("into from_otr_interface")
  238. try:
  239. base64_stream = base64.b64encode(image_stream)
  240. # 调用接口
  241. try:
  242. if from_remote:
  243. retry_times_1 = 3
  244. # 重试
  245. while retry_times_1:
  246. # _ip = ip_pool("otr", _random=True)
  247. # _port = port_pool("otr", _random=True)
  248. # if _ip == interface_ip_list[1]:
  249. # _port = otr_port_list[0]
  250. ip_port = interface_pool("otr")
  251. # ip_port = from_schedule_interface("otr")
  252. if judge_error_code(ip_port):
  253. return ip_port
  254. _url = ip_port + "/otr"
  255. r = json.loads(request_post(_url, {"data": base64_stream, "is_from_pdf": is_from_pdf}, time_out=60))
  256. if type(r) == list:
  257. # 接口连不上换个端口重试
  258. if retry_times_1 <= 1:
  259. return r
  260. else:
  261. retry_times_1 -= 1
  262. log("retry post otr_interface... left times " + str(retry_times_1))
  263. continue
  264. if judge_error_code(r):
  265. return r
  266. break
  267. else:
  268. if globals().get("global_otr_model") is None:
  269. globals().update({"global_otr_model": OtrModels().get_model()})
  270. print("=========== init otr model ===========")
  271. r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"), is_from_pdf=is_from_pdf)
  272. except TimeoutError:
  273. return [-5]
  274. except requests.exceptions.ConnectionError as e:
  275. log("from_otr_interface")
  276. print("from_otr_interface", traceback.print_exc())
  277. return [-2]
  278. # 处理结果
  279. _dict = r
  280. list_line = eval(_dict.get("list_line"))
  281. return list_line
  282. except Exception as e:
  283. log("from_otr_interface error!")
  284. print("from_otr_interface", traceback.print_exc())
  285. return [-1]
  286. # def from_schedule_interface(interface_type):
  287. # try:
  288. # _ip = "http://" + get_intranet_ip()
  289. # _port = ip_port_dict.get(_ip).get("schedule")[0]
  290. # _url = _ip + ":" + _port + "/schedule"
  291. # data = {"interface_type": interface_type}
  292. # result = json.loads(request_post(_url, data, time_out=10)).get("data")
  293. # if judge_error_code(result):
  294. # return result
  295. # _ip, _port = result
  296. # log("from_schedule_interface " + _ip + " " + _port)
  297. # return _ip + ":" + _port
  298. # except requests.exceptions.ConnectionError as e:
  299. # log("from_schedule_interface ConnectionError")
  300. # return [-2]
  301. # except:
  302. # log("from_schedule_interface error!")
  303. # traceback.print_exc()
  304. # return [-1]
  305. def interface_pool(interface_type):
  306. ip_port_flag = _global.get("ip_port_flag")
  307. ip_port_dict = _global.get("ip_port")
  308. try:
  309. # 负载均衡, 选取ip
  310. interface_load_list = []
  311. for _ip in ip_port_flag.keys():
  312. if ip_port_dict.get(_ip).get(interface_type):
  313. load_scale = ip_port_flag.get(_ip).get(interface_type) / len(ip_port_dict.get(_ip).get(interface_type))
  314. interface_load_list.append([_ip, load_scale])
  315. if not interface_load_list:
  316. raise NotFound
  317. interface_load_list.sort(key=lambda x: x[-1])
  318. _ip = interface_load_list[0][0]
  319. # 负载均衡, 选取port
  320. ip_type_cnt = ip_port_flag.get(_ip).get(interface_type)
  321. ip_type_total = len(ip_port_dict.get(_ip).get(interface_type))
  322. if ip_type_cnt == 0:
  323. ip_type_cnt = random.randint(0, ip_type_total-1)
  324. port_index = ip_type_cnt % ip_type_total
  325. _port = ip_port_dict.get(_ip).get(interface_type)[port_index]
  326. # 更新flag
  327. current_flag = ip_type_cnt
  328. if current_flag >= 10000:
  329. ip_port_flag[_ip][interface_type] = 0
  330. else:
  331. ip_port_flag[_ip][interface_type] = current_flag + 1
  332. _global.update({"ip_port_flag": ip_port_flag})
  333. log(str(_global.get("ip_port_flag")))
  334. ip_port = _ip + ":" + str(_port)
  335. log(ip_port)
  336. return ip_port
  337. except NotFound:
  338. log("cannot read ip from config! checkout config")
  339. return [-2]
  340. except:
  341. traceback.print_exc()
  342. return [-1]
  343. # def interface_pool(interface_type):
  344. # try:
  345. # ip_port_dict = _global.get("ip_port")
  346. # ip_list = list(ip_port_dict.keys())
  347. # _ip = random.choice(ip_list)
  348. # if interface_type != 'office':
  349. # _port = ip_port_dict.get(_ip).get(interface_type)[0]
  350. # else:
  351. # _port = random.choice(ip_port_dict.get(_ip).get(interface_type))
  352. # log(_ip + ":" + _port)
  353. # return _ip + ":" + _port
  354. # except Exception as e:
  355. # traceback.print_exc()
  356. # return [-1]
  357. # def ip_pool(interface_type, _random=False):
  358. # ip_flag_name = interface_type + '_ip_flag'
  359. # ip_flag = globals().get(ip_flag_name)
  360. # if ip_flag is None:
  361. # if _random:
  362. # _r = random.randint(0, len(interface_ip_list)-1)
  363. # ip_flag = _r
  364. # globals().update({ip_flag_name: ip_flag})
  365. # ip_index = _r
  366. # else:
  367. # ip_flag = 0
  368. # globals().update({ip_flag_name: ip_flag})
  369. # ip_index = 0
  370. # else:
  371. # ip_index = ip_flag % len(interface_ip_list)
  372. # ip_flag += 1
  373. #
  374. # if ip_flag >= 10000:
  375. # ip_flag = 0
  376. # globals().update({ip_flag_name: ip_flag})
  377. #
  378. # log("ip_pool " + interface_type + " " + str(ip_flag) + " " + str(interface_ip_list[ip_index]))
  379. # return interface_ip_list[ip_index]
  380. #
  381. #
  382. # def port_pool(interface_type, _random=False):
  383. # port_flag_name = interface_type + '_port_flag'
  384. #
  385. # port_flag = globals().get(port_flag_name)
  386. # if port_flag is None:
  387. # if _random:
  388. # if interface_type == "ocr":
  389. # _r = random.randint(0, len(ocr_port_list)-1)
  390. # elif interface_type == "otr":
  391. # _r = random.randint(0, len(otr_port_list)-1)
  392. # else:
  393. # _r = random.randint(0, len(soffice_port_list)-1)
  394. # port_flag = _r
  395. # globals().update({port_flag_name: port_flag})
  396. # port_index = _r
  397. # else:
  398. # port_flag = 0
  399. # globals().update({port_flag_name: port_flag})
  400. # port_index = 0
  401. # else:
  402. # if interface_type == "ocr":
  403. # port_index = port_flag % len(ocr_port_list)
  404. # elif interface_type == "otr":
  405. # port_index = port_flag % len(otr_port_list)
  406. # else:
  407. # port_index = port_flag % len(soffice_port_list)
  408. # port_flag += 1
  409. #
  410. # if port_flag >= 10000:
  411. # port_flag = 0
  412. # globals().update({port_flag_name: port_flag})
  413. #
  414. # if interface_type == "ocr":
  415. # log("port_pool " + interface_type + " " + str(port_flag) + " " + ocr_port_list[port_index])
  416. # return ocr_port_list[port_index]
  417. # elif interface_type == "otr":
  418. # log("port_pool " + interface_type + " " + str(port_flag) + " " + otr_port_list[port_index])
  419. # return otr_port_list[port_index]
  420. # else:
  421. # log("port_pool " + interface_type + " " + str(port_flag) + " " + soffice_port_list[port_index])
  422. # return soffice_port_list[port_index]