convert_need_interface.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. import base64
  2. import inspect
  3. import json
  4. import logging
  5. import os
  6. import random
  7. import sys
  8. from werkzeug.exceptions import NotFound
  9. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  10. import traceback
  11. import requests
  12. from format_convert import _global
  13. from format_convert.utils import get_platform, get_sequential_data, judge_error_code, request_post, get_ip_port, \
  14. get_intranet_ip, get_logger, log, memory_decorator
  15. from ocr.ocr_interface import ocr, OcrModels
  16. from otr.otr_interface import otr, OtrModels
  17. from format_convert.libreoffice_interface import office_convert
# Remote GPU interface endpoints (legacy hard-coded lists, kept commented out for reference)
  19. # # interface_ip_list = ['http://192.168.2.102', 'http://192.168.2.103']
  20. # # interface_ip_list = ['http://172.16.160.65', 'http://172.16.160.64', 'http://172.16.160.66', 'http://172.16.160.67']
  21. # interface_ip_list = ['http://172.16.160.65', 'http://172.16.160.65']
  22. # # ocr_port_list = ["15011", "15013", "15015"]
  23. # # ocr_port_list = ["15011", "15013", "15015", "15017", "15019"]
  24. # # otr_port_list = ["15012", "15014", "15016", "15018", "15020"]
  25. # ocr_port_list = ["15011", "15013", "15015"]
  26. # otr_port_list = ["15012", "15014", "15016"]
  27. # # ocr_port_list = ["15011", "15013", "15015", "15017", "15019", "15021"]
  28. # # otr_port_list = ["15012", "15014", "15016", "15018", "15020", "15022"]
  29. # soffice_port_list = ["16000", "16001", "16002", "16003", "16004", "16005",
  30. # "16006", "16007", "16008", "16009"]
  31. # # ocr_port_list = ["15011", "15013"]
  32. # # otr_port_list = ["15012"]
  33. if get_platform() == "Windows":
  34. FROM_REMOTE = False
  35. else:
  36. FROM_REMOTE = True
  37. # _global = {}
  38. # ip_port_flag = {}
  39. # ip_port_dict = get_ip_port()
  40. # for _k in ip_port_dict.keys():
  41. # ip_port_flag.update({_k: {"ocr": 0,
  42. # "otr": 0,
  43. # "convert": 0,
  44. # "office": 0
  45. # }})
  46. # _global.update({"ip_port_flag": ip_port_flag})
  47. # _global.update({"ip_port": ip_port_dict})
  48. def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
  49. try:
  50. # Win10跳出超时装饰器
  51. # if get_platform() == "Windows":
  52. # # origin_office_convert = office_convert.__wrapped__
  53. # # file_path = origin_office_convert(src_path, dest_path, target_format, retry_times)
  54. # file_path = office_convert(src_path, dest_path, target_format, retry_times)
  55. # else:
  56. # # 将装饰器包装为一个类,否则多进程Pickle会报错 it's not the same object as xxx 问题,
  57. # # timeout_decorator_obj = my_timeout_decorator.TimeoutClass(office_convert, 180, TimeoutError)
  58. # # file_path = timeout_decorator_obj.run(src_path, dest_path, target_format, retry_times)
  59. #
  60. # file_path = office_convert(src_path, dest_path, target_format, retry_times)
  61. if from_remote:
  62. # 重试
  63. retry_times_1 = 1
  64. retry_times_2 = 2
  65. while retry_times_1 and retry_times_2:
  66. # _ip = ip_pool("soffice", _random=True)
  67. # _port = port_pool("soffice", _random=True)
  68. # _ip = interface_ip_list[0]
  69. # _port = "16002"
  70. # _ip, _port = interface_pool("soffice")
  71. # ip_port = from_schedule_interface("office")
  72. ip_port = interface_pool("office")
  73. if judge_error_code(ip_port):
  74. return ip_port
  75. _url = ip_port + "/soffice"
  76. with open(src_path, "rb") as f:
  77. file_bytes = f.read()
  78. base64_stream = base64.b64encode(file_bytes)
  79. r = json.loads(request_post(_url, {"src_path": src_path,
  80. "dest_path": dest_path,
  81. "file": base64_stream,
  82. "target_format": target_format,
  83. "retry_times": retry_times}, time_out=15))
  84. if type(r) == list:
  85. # 接口连不上换个端口重试
  86. if retry_times_1 <= 1:
  87. return r
  88. else:
  89. retry_times_1 -= 1
  90. log("retry post office_interface... left times " + str(retry_times_1))
  91. continue
  92. file_str = r.get("data")
  93. if judge_error_code(file_str):
  94. if retry_times_2 <= 1:
  95. return file_str
  96. else:
  97. retry_times_2 -= 1
  98. continue
  99. file_bytes = eval(file_str)
  100. uid1 = src_path.split(os.sep)[-1].split(".")[0]
  101. file_path = dest_path + uid1 + "." + target_format
  102. if not os.path.exists(os.path.dirname(file_path)):
  103. os.makedirs(os.path.dirname(file_path), mode=0o777)
  104. with open(file_path, "wb") as f:
  105. f.write(file_bytes)
  106. break
  107. else:
  108. file_path = office_convert(src_path, dest_path, target_format, retry_times)
  109. if judge_error_code(file_path):
  110. return file_path
  111. return file_path
  112. except TimeoutError:
  113. log("from_office_interface timeout error!")
  114. return [-5]
  115. except:
  116. log("from_office_interface error!")
  117. print("from_office_interface", traceback.print_exc())
  118. return [-1]
  119. def from_ocr_interface(image_stream, is_table=False, from_remote=FROM_REMOTE):
  120. log("into from_ocr_interface")
  121. try:
  122. base64_stream = base64.b64encode(image_stream)
  123. # 调用接口
  124. try:
  125. if from_remote:
  126. retry_times_1 = 3
  127. # 重试
  128. while retry_times_1:
  129. # _ip = ip_pool("ocr", _random=True)
  130. # _port = port_pool("ocr", _random=True)
  131. # if _ip == interface_ip_list[1]:
  132. # _port = ocr_port_list[0]
  133. # _ip, _port = interface_pool("ocr")
  134. # ip_port = _ip + ":" + _port
  135. # ip_port = from_schedule_interface("ocr")
  136. ip_port = interface_pool("ocr")
  137. if judge_error_code(ip_port):
  138. return ip_port
  139. _url = ip_port + "/ocr"
  140. r = json.loads(request_post(_url, {"data": base64_stream}, time_out=60))
  141. if type(r) == list:
  142. # 接口连不上换个端口重试
  143. if retry_times_1 <= 1:
  144. if is_table:
  145. return r, r
  146. else:
  147. return r
  148. else:
  149. retry_times_1 -= 1
  150. log("retry post ocr_interface... left times " + str(retry_times_1))
  151. continue
  152. if judge_error_code(r):
  153. return r
  154. break
  155. else:
  156. if globals().get("global_ocr_model") is None:
  157. globals().update({"global_ocr_model": OcrModels().get_model()})
  158. print("=========== init ocr model ===========")
  159. r = ocr(data=base64_stream, ocr_model=globals().get("global_ocr_model"))
  160. except TimeoutError:
  161. if is_table:
  162. return [-5], [-5]
  163. else:
  164. return [-5]
  165. except requests.exceptions.ConnectionError as e:
  166. if is_table:
  167. return [-2], [-2]
  168. else:
  169. return [-2]
  170. _dict = r
  171. text_list = eval(_dict.get("text"))
  172. bbox_list = eval(_dict.get("bbox"))
  173. if text_list is None:
  174. text_list = []
  175. if bbox_list is None:
  176. bbox_list = []
  177. if is_table:
  178. return text_list, bbox_list
  179. else:
  180. if text_list and bbox_list:
  181. text = get_sequential_data(text_list, bbox_list, html=True)
  182. if judge_error_code(text):
  183. return text
  184. else:
  185. text = ""
  186. return text
  187. except Exception as e:
  188. log("from_ocr_interface error!")
  189. # print("from_ocr_interface", e, global_type)
  190. if is_table:
  191. return [-1], [-1]
  192. else:
  193. return [-1]
  194. def from_otr_interface2(image_stream):
  195. log("into from_otr_interface")
  196. try:
  197. base64_stream = base64.b64encode(image_stream)
  198. # 调用接口
  199. try:
  200. if globals().get("global_otr_model") is None:
  201. globals().update({"global_otr_model": OtrModels().get_model()})
  202. print("=========== init otr model ===========")
  203. r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"))
  204. except TimeoutError:
  205. return [-5], [-5], [-5], [-5], [-5]
  206. except requests.exceptions.ConnectionError as e:
  207. log("from_otr_interface")
  208. print("from_otr_interface", traceback.print_exc())
  209. return [-2], [-2], [-2], [-2], [-2]
  210. # 处理结果
  211. _dict = r
  212. points = eval(_dict.get("points"))
  213. split_lines = eval(_dict.get("split_lines"))
  214. bboxes = eval(_dict.get("bboxes"))
  215. outline_points = eval(_dict.get("outline_points"))
  216. lines = eval(_dict.get("lines"))
  217. # print("from_otr_interface len(bboxes)", len(bboxes))
  218. if points is None:
  219. points = []
  220. if split_lines is None:
  221. split_lines = []
  222. if bboxes is None:
  223. bboxes = []
  224. if outline_points is None:
  225. outline_points = []
  226. if lines is None:
  227. lines = []
  228. return points, split_lines, bboxes, outline_points, lines
  229. except Exception as e:
  230. log("from_otr_interface error!")
  231. print("from_otr_interface", traceback.print_exc())
  232. return [-1], [-1], [-1], [-1], [-1]
  233. def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE):
  234. log("into from_otr_interface")
  235. try:
  236. base64_stream = base64.b64encode(image_stream)
  237. # 调用接口
  238. try:
  239. if from_remote:
  240. retry_times_1 = 3
  241. # 重试
  242. while retry_times_1:
  243. # _ip = ip_pool("otr", _random=True)
  244. # _port = port_pool("otr", _random=True)
  245. # if _ip == interface_ip_list[1]:
  246. # _port = otr_port_list[0]
  247. ip_port = interface_pool("otr")
  248. # ip_port = from_schedule_interface("otr")
  249. if judge_error_code(ip_port):
  250. return ip_port
  251. _url = ip_port + "/otr"
  252. r = json.loads(request_post(_url, {"data": base64_stream, "is_from_pdf": is_from_pdf}, time_out=60))
  253. if type(r) == list:
  254. # 接口连不上换个端口重试
  255. if retry_times_1 <= 1:
  256. return r
  257. else:
  258. retry_times_1 -= 1
  259. log("retry post otr_interface... left times " + str(retry_times_1))
  260. continue
  261. if judge_error_code(r):
  262. return r
  263. break
  264. else:
  265. if globals().get("global_otr_model") is None:
  266. globals().update({"global_otr_model": OtrModels().get_model()})
  267. print("=========== init otr model ===========")
  268. r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"), is_from_pdf=is_from_pdf)
  269. except TimeoutError:
  270. return [-5]
  271. except requests.exceptions.ConnectionError as e:
  272. log("from_otr_interface")
  273. print("from_otr_interface", traceback.print_exc())
  274. return [-2]
  275. # 处理结果
  276. _dict = r
  277. list_line = eval(_dict.get("list_line"))
  278. return list_line
  279. except Exception as e:
  280. log("from_otr_interface error!")
  281. print("from_otr_interface", traceback.print_exc())
  282. return [-1]
  283. # def from_schedule_interface(interface_type):
  284. # try:
  285. # _ip = "http://" + get_intranet_ip()
  286. # _port = ip_port_dict.get(_ip).get("schedule")[0]
  287. # _url = _ip + ":" + _port + "/schedule"
  288. # data = {"interface_type": interface_type}
  289. # result = json.loads(request_post(_url, data, time_out=10)).get("data")
  290. # if judge_error_code(result):
  291. # return result
  292. # _ip, _port = result
  293. # log("from_schedule_interface " + _ip + " " + _port)
  294. # return _ip + ":" + _port
  295. # except requests.exceptions.ConnectionError as e:
  296. # log("from_schedule_interface ConnectionError")
  297. # return [-2]
  298. # except:
  299. # log("from_schedule_interface error!")
  300. # traceback.print_exc()
  301. # return [-1]
  302. def interface_pool(interface_type):
  303. ip_port_flag = _global.get("ip_port_flag")
  304. ip_port_dict = _global.get("ip_port")
  305. log(str(_global.get("ip_port_flag")))
  306. try:
  307. # 负载均衡, 选取ip
  308. interface_load_list = []
  309. for _ip in ip_port_flag.keys():
  310. if ip_port_dict.get(_ip).get(interface_type):
  311. load_scale = ip_port_flag.get(_ip).get(interface_type) / len(ip_port_dict.get(_ip).get(interface_type))
  312. interface_load_list.append([_ip, load_scale])
  313. if not interface_load_list:
  314. raise NotFound
  315. interface_load_list.sort(key=lambda x: x[-1])
  316. _ip = interface_load_list[0][0]
  317. # 负载均衡, 选取port
  318. ip_type_cnt = ip_port_flag.get(_ip).get(interface_type)
  319. ip_type_total = len(ip_port_dict.get(_ip).get(interface_type))
  320. if ip_type_cnt == 0:
  321. ip_type_cnt = random.randint(0, ip_type_total-1)
  322. port_index = ip_type_cnt % ip_type_total
  323. _port = ip_port_dict.get(_ip).get(interface_type)[port_index]
  324. # 更新flag
  325. current_flag = ip_type_cnt
  326. if current_flag >= 10000:
  327. ip_port_flag[_ip][interface_type] = 0
  328. else:
  329. ip_port_flag[_ip][interface_type] = current_flag + 1
  330. _global.update({"ip_port_flag": ip_port_flag})
  331. log(str(_global.get("ip_port_flag")))
  332. ip_port = _ip + ":" + str(_port)
  333. log(ip_port)
  334. return ip_port
  335. except NotFound:
  336. log("cannot read ip from config! checkout config")
  337. return [-2]
  338. except:
  339. traceback.print_exc()
  340. return [-1]
  341. # def ip_pool(interface_type, _random=False):
  342. # ip_flag_name = interface_type + '_ip_flag'
  343. # ip_flag = globals().get(ip_flag_name)
  344. # if ip_flag is None:
  345. # if _random:
  346. # _r = random.randint(0, len(interface_ip_list)-1)
  347. # ip_flag = _r
  348. # globals().update({ip_flag_name: ip_flag})
  349. # ip_index = _r
  350. # else:
  351. # ip_flag = 0
  352. # globals().update({ip_flag_name: ip_flag})
  353. # ip_index = 0
  354. # else:
  355. # ip_index = ip_flag % len(interface_ip_list)
  356. # ip_flag += 1
  357. #
  358. # if ip_flag >= 10000:
  359. # ip_flag = 0
  360. # globals().update({ip_flag_name: ip_flag})
  361. #
  362. # log("ip_pool " + interface_type + " " + str(ip_flag) + " " + str(interface_ip_list[ip_index]))
  363. # return interface_ip_list[ip_index]
  364. #
  365. #
  366. # def port_pool(interface_type, _random=False):
  367. # port_flag_name = interface_type + '_port_flag'
  368. #
  369. # port_flag = globals().get(port_flag_name)
  370. # if port_flag is None:
  371. # if _random:
  372. # if interface_type == "ocr":
  373. # _r = random.randint(0, len(ocr_port_list)-1)
  374. # elif interface_type == "otr":
  375. # _r = random.randint(0, len(otr_port_list)-1)
  376. # else:
  377. # _r = random.randint(0, len(soffice_port_list)-1)
  378. # port_flag = _r
  379. # globals().update({port_flag_name: port_flag})
  380. # port_index = _r
  381. # else:
  382. # port_flag = 0
  383. # globals().update({port_flag_name: port_flag})
  384. # port_index = 0
  385. # else:
  386. # if interface_type == "ocr":
  387. # port_index = port_flag % len(ocr_port_list)
  388. # elif interface_type == "otr":
  389. # port_index = port_flag % len(otr_port_list)
  390. # else:
  391. # port_index = port_flag % len(soffice_port_list)
  392. # port_flag += 1
  393. #
  394. # if port_flag >= 10000:
  395. # port_flag = 0
  396. # globals().update({port_flag_name: port_flag})
  397. #
  398. # if interface_type == "ocr":
  399. # log("port_pool " + interface_type + " " + str(port_flag) + " " + ocr_port_list[port_index])
  400. # return ocr_port_list[port_index]
  401. # elif interface_type == "otr":
  402. # log("port_pool " + interface_type + " " + str(port_flag) + " " + otr_port_list[port_index])
  403. # return otr_port_list[port_index]
  404. # else:
  405. # log("port_pool " + interface_type + " " + str(port_flag) + " " + soffice_port_list[port_index])
  406. # return soffice_port_list[port_index]