# convert.py
  1. #-*- coding: utf-8 -*-
  2. import json
  3. import sys
  4. import os
  5. from io import BytesIO
  6. import objgraph
  7. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  8. # 强制tf使用cpu
  9. os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
  10. from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
  11. set_flask_global, get_md5_from_bytes
  12. from format_convert.convert_doc import doc2text, DocConvert
  13. from format_convert.convert_docx import docx2text, DocxConvert
  14. from format_convert.convert_image import picture2text, ImageConvert
  15. from format_convert.convert_pdf import pdf2text, PDFConvert
  16. from format_convert.convert_rar import rar2text, RarConvert
  17. from format_convert.convert_swf import swf2text, SwfConvert
  18. from format_convert.convert_txt import txt2text, TxtConvert
  19. from format_convert.convert_xls import xls2text, XlsConvert
  20. from format_convert.convert_xlsx import xlsx2text, XlsxConvert
  21. from format_convert.convert_zip import zip2text, ZipConvert
  22. import hashlib
  23. from format_convert import get_memory_info
  24. from format_convert.judge_platform import get_platform
  25. from ocr import ocr_interface
  26. from otr import otr_interface
  27. import re
  28. import shutil
  29. import base64
  30. import time
  31. import uuid
  32. import logging
  33. from bs4 import BeautifulSoup
  34. from flask import Flask, request, g
  35. import inspect
  36. logging.getLogger("pdfminer").setLevel(logging.WARNING)
  37. from format_convert.table_correct import *
  38. import logging
  39. from format_convert import timeout_decorator
  40. from format_convert.wrapt_timeout_decorator import *
  41. from format_convert import _global
  42. port_num = [0]
  43. def choose_port():
  44. process_num = 4
  45. if port_num[0] % process_num == 0:
  46. _url = local_url + ":15011"
  47. elif port_num[0] % process_num == 1:
  48. _url = local_url + ":15012"
  49. elif port_num[0] % process_num == 2:
  50. _url = local_url + ":15013"
  51. elif port_num[0] % process_num == 3:
  52. _url = local_url + ":15014"
  53. port_num[0] = port_num[0] + 1
  54. return _url
  55. def getText(_type, path_or_stream):
  56. print("file type - " + _type)
  57. log("file type - " + _type)
  58. try:
  59. ss = path_or_stream.split(".")
  60. unique_type_dir = ss[-2] + "_" + ss[-1] + os.sep
  61. except:
  62. unique_type_dir = path_or_stream + "_" + _type + os.sep
  63. if _type == "pdf":
  64. # return pdf2text(path_or_stream, unique_type_dir)
  65. return PDFConvert(path_or_stream, unique_type_dir).get_html()
  66. if _type == "docx":
  67. # return docx2text(path_or_stream, unique_type_dir)
  68. return DocxConvert(path_or_stream, unique_type_dir).get_html()
  69. if _type == "zip":
  70. # return zip2text(path_or_stream, unique_type_dir)
  71. return ZipConvert(path_or_stream, unique_type_dir).get_html()
  72. if _type == "rar":
  73. # return rar2text(path_or_stream, unique_type_dir)
  74. return RarConvert(path_or_stream, unique_type_dir).get_html()
  75. if _type == "xlsx":
  76. # return xlsx2text(path_or_stream, unique_type_dir)
  77. return XlsxConvert(path_or_stream, unique_type_dir).get_html()
  78. if _type == "xls":
  79. # return xls2text(path_or_stream, unique_type_dir)
  80. return XlsConvert(path_or_stream, unique_type_dir).get_html()
  81. if _type == "doc":
  82. # return doc2text(path_or_stream, unique_type_dir)
  83. return DocConvert(path_or_stream, unique_type_dir).get_html()
  84. if _type == "jpg" or _type == "png" or _type == "jpeg":
  85. # return picture2text(path_or_stream)
  86. return ImageConvert(path_or_stream, unique_type_dir).get_html()
  87. if _type == "swf":
  88. # return swf2text(path_or_stream, unique_type_dir)
  89. return SwfConvert(path_or_stream, unique_type_dir).get_html()
  90. if _type == "txt":
  91. # return txt2text(path_or_stream)
  92. return TxtConvert(path_or_stream, unique_type_dir).get_html()
  93. return [""]
  94. def to_html(path, text):
  95. with open(path, 'w',encoding="utf8") as f:
  96. f.write("<!DOCTYPE HTML>")
  97. f.write('<head><meta charset="UTF-8"></head>')
  98. f.write("<body>")
  99. f.write(text)
  100. f.write("</body>")
  101. def resize_image(image_path, size):
  102. try:
  103. image_np = cv2.imread(image_path)
  104. # print(image_np.shape)
  105. width = image_np.shape[1]
  106. height = image_np.shape[0]
  107. h_w_rate = height / width
  108. # width_standard = 900
  109. # height_standard = 1400
  110. width_standard = size[1]
  111. height_standard = size[0]
  112. width_new = int(height_standard / h_w_rate)
  113. height_new = int(width_standard * h_w_rate)
  114. if width > width_standard:
  115. image_np = cv2.resize(image_np, (width_standard, height_new))
  116. elif height > height_standard:
  117. image_np = cv2.resize(image_np, (width_new, height_standard))
  118. cv2.imwrite(image_path, image_np)
  119. # print("resize_image", image_np.shape)
  120. return
  121. except Exception as e:
  122. log("resize_image")
  123. print("resize_image", e, global_type)
  124. return
def remove_red_seal(image_np):
    """
    Remove a red stamp/seal from an image.

    Thresholds the red channel (OTSU to find the split, then a fixed 98% of
    that threshold) and recombines the mask with the inverted blue channel so
    that red-only strokes vanish while dark ink survives.
    NOTE(review): the cv2.imshow/cv2.waitKey(0) calls block until a key press
    — debug leftovers that make this unusable in a headless service; confirm
    before relying on this function server-side.
    """
    # Split channels; the seal is strongest in the red channel.
    blue_c, green_c, red_c = cv2.split(image_np)
    # Passing cv2.THRESH_OTSU with thresh=0 lets the algorithm pick the optimal threshold.
    thresh, ret = cv2.threshold(red_c, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Empirically a value slightly below the OTSU threshold works better (98% here).
    filter_condition = int(thresh * 0.98)
    thresh1, red_thresh = cv2.threshold(red_c, filter_condition, 255, cv2.THRESH_BINARY)
    # Expand the single-channel mask back to 3 channels.
    image_and = np.expand_dims(red_thresh, axis=2)
    image_and = np.concatenate((image_and, image_and, image_and), axis=-1)
    # Erode with a small rectangular kernel to thicken dark strokes slightly.
    gray = cv2.cvtColor(image_and, cv2.COLOR_RGB2GRAY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    erode = cv2.erode(gray, kernel)
    cv2.imshow("erode", erode)
    cv2.waitKey(0)
    # Combine with the inverted blue channel, then invert back to white background.
    image_and = np.bitwise_and(cv2.bitwise_not(blue_c), cv2.bitwise_not(erode))
    result_img = cv2.bitwise_not(image_and)
    cv2.imshow("remove_red_seal", result_img)
    cv2.waitKey(0)
    return result_img
def remove_underline(image_np):
    """
    Detect underlines beneath text.

    Binarizes the image, applies row/column Sobel kernels, then erodes and
    dilates with a wide (cols // 5 x 1) structuring element to isolate long
    horizontal lines.
    NOTE(review): despite the name, the function currently returns None and
    never removes the detected lines from the image; the cv2.imshow/waitKey
    calls are blocking debug aids — confirm intent before production use.
    """
    # Grayscale.
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    # Adaptive binarization of the inverted grayscale image.
    binary = cv2.adaptiveThreshold(~gray, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
                                   15, 10)
    # Sobel-style kernels: horizontal edges (row) and vertical edges (col).
    kernel_row = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], np.float32)
    kernel_col = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], np.float32)
    # binary = cv2.filter2D(binary, -1, kernel=kernel)
    binary_row = cv2.filter2D(binary, -1, kernel=kernel_row)
    binary_col = cv2.filter2D(binary, -1, kernel=kernel_col)
    cv2.imshow("custom_blur_demo", binary)
    cv2.waitKey(0)
    rows, cols = binary.shape
    # Detect horizontal lines: erode then dilate with a wide flat kernel.
    scale = 5
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (cols // scale, 1))
    erodedcol = cv2.erode(binary_row, kernel, iterations=1)
    cv2.imshow("Eroded Image", erodedcol)
    cv2.waitKey(0)
    dilatedcol = cv2.dilate(erodedcol, kernel, iterations=1)
    cv2.imshow("dilate Image", dilatedcol)
    cv2.waitKey(0)
    return
  181. def getMDFFromFile(path):
  182. _length = 0
  183. try:
  184. _md5 = hashlib.md5()
  185. with open(path, "rb") as ff:
  186. while True:
  187. data = ff.read(4096)
  188. if not data:
  189. break
  190. _length += len(data)
  191. _md5.update(data)
  192. return _md5.hexdigest(), _length
  193. except Exception as e:
  194. traceback.print_exc()
  195. return None, _length
  196. def add_html_format(text_list):
  197. new_text_list = []
  198. for t in text_list:
  199. html_t = "<!DOCTYPE HTML>\n"
  200. html_t += '<head><meta charset="UTF-8"></head>\n'
  201. html_t += "<body>\n"
  202. html_t += t
  203. html_t += "\n</body>\n"
  204. new_text_list.append(html_t)
  205. return new_text_list
# Whole-conversion timeout in seconds for the decorator below: generous on
# Windows (development), tighter on Linux servers.
if get_platform() == "Windows":
    time_out = 1000
else:
    time_out = 300
# @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
@timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
def unique_temp_file_process(stream, _type, _md5):
    """
    Write *stream* into a unique temp directory, convert it, and clean up.

    :param stream: raw file bytes
    :param _type: file extension string, e.g. "pdf"
    :param _md5: md5 of the stream, stored into globals/_global for logging
    :return: (text, swf_images) where text is the converter output list and
             swf_images is a list of base64-encoded PNGs (swf input only);
             ([-1], []) on any failure.  The temp directory is removed in the
             finally block on Linux.
    """
    if get_platform() == "Windows":
        _global._init()
    globals().update({"md5": _md5})
    _global.update({"md5": _md5})
    log("into unique_temp_file_process")
    try:
        # Each call gets its own unique directory under temp/.
        uid1 = uuid.uuid1().hex
        unique_space_path = _path + os.sep + "temp" + os.sep + uid1 + os.sep
        # unique_space_path = "/mnt/fangjiasheng/" + "temp/" + uid1 + "/"
        # Guard against a name collision.
        if not os.path.exists(unique_space_path):
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(unique_space_path)
        else:
            uid2 = uuid.uuid1().hex
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(_path + os.sep + "temp" + os.sep + uid2 + os.sep)
            # os.mkdir("/mnt/" + "temp/" + uid2 + "/")
        # Inside that space the uploaded file itself also gets a unique name.
        uid3 = uuid.uuid1().hex
        file_path = unique_space_path + uid3 + "." + _type
        with open(file_path, "wb") as ff:
            ff.write(stream)
        text = getText(_type, file_path)
        # Collect the images produced by swf conversion.
        swf_images = []
        if _type == "swf":
            image_name_list = []
            for root, dirs, files in os.walk(unique_space_path, topdown=False):
                for name in files:
                    if name[-4:] == ".png" and "resize" not in name:
                        image_name_list.append(name)
            image_name_list.sort(key=lambda x: x)
            for name in image_name_list:
                with open(os.path.join(unique_space_path, name), "rb") as f:
                    img_bytes = f.read()
                    swf_images.append(base64.b64encode(img_bytes))
        log("unique_temp_file_process len(swf_images) " + str(len(swf_images)))
        return text, swf_images
    except Exception as e:
        log("unique_temp_file_process failed!")
        traceback.print_exc()
        return [-1], []
    finally:
        print("======================================")
        try:
            if get_platform() == "Linux":
                # Remove everything under the unique space.
                if os.path.exists(unique_space_path):
                    shutil.rmtree(unique_space_path)
        except Exception as e:
            log("Delete Files Failed!")
  268. def cut_str(text_list, only_text_list, max_bytes_length=2000000):
  269. log("into cut_str")
  270. try:
  271. # 计算有格式总字节数
  272. bytes_length = 0
  273. for text in text_list:
  274. bytes_length += len(bytes(text, encoding='utf-8'))
  275. # print("text_list", bytes_length)
  276. # 小于直接返回
  277. if bytes_length < max_bytes_length:
  278. print("return text_list no cut")
  279. return text_list
  280. # 全部文件连接,重新计算无格式字节数
  281. all_text = ""
  282. bytes_length = 0
  283. for text in only_text_list:
  284. bytes_length += len(bytes(text, encoding='utf-8'))
  285. all_text += text
  286. # print("only_text_list", bytes_length)
  287. # 小于直接返回
  288. if bytes_length < max_bytes_length:
  289. print("return only_text_list no cut")
  290. return only_text_list
  291. # 截取字符
  292. all_text = all_text[:int(max_bytes_length/3)]
  293. # print("text bytes ", len(bytes(all_text, encoding='utf-8')))
  294. # print("return only_text_list has cut")
  295. return [all_text]
  296. except Exception as e:
  297. log("cut_str " + str(e))
  298. return ["-1"]
@get_memory_info.memory_decorator
def convert(data, ocr_model, otr_model):
    """
    Convert a base64-encoded file to HTML and plain text (in-process entry point).

    Interface return values:
    {[str], 1}: success
    {[-1], 0}: logic error
    {[-2], 0}: interface call error
    {[-3], 1}: bad file format, cannot be opened
    {[-4], 0}: a third-party reader timed out for some file type
    {[-5], 0}: the whole conversion timed out
    {[-6], 0}: Aliyun UDF queue timeout
    {[-7], 1}: file requires a password, cannot be opened
    :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
    """
    # Memory limiting (disabled).
    # soft, hard = resource.getrlimit(resource.RLIMIT_AS)
    # resource.setrlimit(resource.RLIMIT_AS, (15 * 1024 ** 3, hard))
    log("into convert")
    start_time = time.time()
    _md5 = "1000000"
    try:
        # Expose the models as globals for the downstream converters.
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        if get_platform() == "Windows":
            # Bypass the timeout decorator and call the wrapped function directly.
            origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            text, swf_images = origin_unique_temp_file_process(stream, _type, _md5)
        else:
            # On Linux the decorator enforces the whole-conversion timeout.
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 1200 sec")
                text = [-5]
                swf_images = []
        error_code = [[-x] for x in range(1, 9)]
        still_success_code = [[-3], [-7]]
        if text in error_code:
            if text in still_success_code:
                print({"failed result": text, "is_success": 1}, time.time() - start_time)
                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                        "is_success": 1}
            else:
                print({"failed result": text, "is_success": 0}, time.time() - start_time)
                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                        "is_success": 0}
        # Save the result as result.html (Windows debugging aid).
        if get_platform() == "Windows":
            text_str = ""
            for t in text:
                text_str += t
            to_html("../result.html", text_str)
        # Extract the plain text.
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Truncate over-long results.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
        else:
            print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
                                        "is_success": 1}, time.time() - start_time)
        return {"result_html": text, "result_text": only_text, "is_success": 1}
    except Exception as e:
        print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
        print("convert", traceback.print_exc())
        return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
# Interface configuration.
app = Flask(__name__)


@app.route('/convert', methods=['POST'])
def _convert():
    """
    HTTP entry point: convert a posted base64 file to HTML and plain text.

    Interface return values:
    {[str], 1}: success
    {[-1], 0}: logic error
    {[-2], 0}: interface call error
    {[-3], 1}: bad file format, cannot be opened
    {[-4], 0}: a third-party reader timed out for some file type
    {[-5], 0}: the whole conversion timed out
    {[-6], 0}: Aliyun UDF queue timeout
    {[-7], 1}: file requires a password, cannot be opened
    :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
    """
    # log("growth start" + str(objgraph.growth()))
    # log("most_common_types start" + str(objgraph.most_common_types(20)))
    log("into convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    try:
        if not request.form:
            log("convert no data!")
            raise ConnectionError
        data = request.form
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        if get_platform() == "Windows":
            # (previously bypassed the timeout decorator on Windows)
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # On Linux the decorator enforces the whole-conversion timeout.
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        if judge_error_code(text):
            # -3 and -7 are "failed but report success" codes (bad format / password).
            if judge_error_code(text, [-3, -7]):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success)
                + " " + str(time.time() - start_time))
            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                               "is_success": is_success, "swf_images": str(swf_images)})
        # error_code = [[-x] for x in range(1, 9)]
        # still_success_code = [[-3], [-7]]
        # if text in error_code:
        #     if text in still_success_code:
        #         print({"failed result": text, "is_success": 1}, time.time() - start_time)
        #         log("md5: " + str(_md5) + " finished result: " + str(text) + " is_success: 1 " + str(time.time() - start_time))
        #         return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
        #                            "is_success": 1, "swf_images": str(swf_images)})
        #     else:
        #         print({"failed result": text, "is_success": 0}, time.time() - start_time)
        #         log("md5: " + str(_md5) + " finished result: " + str(text) + " is_success: 0 " + str(time.time() - start_time))
        #         return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
        #                            "is_success": 0, "swf_images": str(swf_images)})
        # Save the result as result.html (debugging aid).
        # if get_platform() == "Windows":
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract the plain text.
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Truncate over-long results.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " finished result: ['', 0] is_success: 1 "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(time.time() - start_time))
        log("growth end" + str(objgraph.growth()))
        log("most_common_types end" + str(objgraph.most_common_types(20)))
        return json.dumps({"result_html": text, "result_text": only_text,
                           "is_success": 1, "swf_images": str(swf_images)})
    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 " +
            str(time.time() - start_time))
        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                           "is_success": 0, "swf_images": str([])})
    except Exception as e:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 " +
            str(time.time() - start_time))
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([])})
  490. def test_more(_dir, process_no=None):
  491. file_path_list = []
  492. for root, dirs, files in os.walk(_dir, topdown=False):
  493. for name in files:
  494. file_path_list.append(os.path.join(root, name))
  495. start_time = time.time()
  496. i = 0
  497. for p in file_path_list:
  498. if i % 10 == 0:
  499. if process_no is not None:
  500. print("Process", process_no, i, time.time()-start_time)
  501. else:
  502. print("Loop", i, time.time()-start_time)
  503. test_one(p, from_remote=True)
  504. i += 1
  505. def test_one(p, from_remote=False):
  506. with open(p, "rb") as f:
  507. file_bytes = f.read()
  508. file_base64 = base64.b64encode(file_bytes)
  509. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
  510. if from_remote:
  511. ocr_model = None
  512. otr_model = None
  513. _url = 'http://127.0.0.1:15010/convert'
  514. # _url = 'http://192.168.2.102:15010/convert'
  515. # _url = 'http://172.16.160.65:15010/convert'
  516. result = json.loads(request_post(_url, data, time_out=10000))
  517. if p.split(".")[-1] == "swf":
  518. swf_images = eval(result.get("swf_images"))
  519. print(type(swf_images))
  520. # for img in swf_images:
  521. # img_bytes = base64.b64decode(img)
  522. # img = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
  523. # cv2.imshow("swf_images", img)
  524. # cv2.waitKey(0)
  525. else:
  526. ocr_model = ocr_interface.OcrModels().get_model()
  527. otr_model = otr_interface.OtrModels().get_model()
  528. result = convert(data, ocr_model, otr_model)
  529. print("result_text", result.get("result_text")[0][:20])
  530. print("is_success", result.get("is_success"))
  531. def test_duplicate(path_list, process_no=None):
  532. start_time = time.time()
  533. for i in range(500):
  534. if i % 10 == 0:
  535. if process_no is not None:
  536. print("Process", process_no, i*len(path_list), time.time()-start_time)
  537. else:
  538. print("Loop", i*len(path_list), time.time()-start_time)
  539. for p in path_list:
  540. test_one(p, from_remote=True)
# Module-level configuration shared by the helpers above.
# global_type tags some debug prints; presumably set per conversion — TODO confirm.
global_type = ""
local_url = "http://127.0.0.1"
# Working root: the package directory on Windows, /home/admin on servers,
# falling back to the package directory when that path does not exist.
if get_platform() == "Windows":
    _path = os.path.abspath(os.path.dirname(__file__))
else:
    _path = "/home/admin"
if not os.path.exists(_path):
    _path = os.path.dirname(os.path.abspath(__file__))
if __name__ == '__main__':
    # Convert interface: port from argv[1], defaulting to 15010.
    if len(sys.argv) == 2:
        port = int(sys.argv[1])
    else:
        port = 15010
    # Seed the md5 placeholder and the shared _global store before serving.
    globals().update({"md5": "1" + "0" * 15})
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    _global.update({"port": str(port)})
    ip = get_intranet_ip()
    ip_port_dict = get_ip_port()
    ip = "http://" + ip
    # Number of Flask worker processes configured for this host.
    processes = ip_port_dict.get(ip).get("convert_processes")
    set_flask_global()
    if get_platform() == "Windows":
        app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
    else:
        app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)

    # --- Manual test drivers kept for reference ---
    # if get_platform() == "Windows":
    #     # file_path = "C:/Users/Administrator/Desktop/error7.jpg"
    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
    #     file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
    # else:
    #     file_path = "test1.doc"
    # test_one(file_path, from_remote=True)

    # if get_platform() == "Windows":
    #     file_dir = "D:/BIDI_DOC/比地_文档/table_images/"
    # else:
    #     file_dir = "../table_images/"
    #
    # for j in range(10):
    #     p = Process(target=test_more, args=(file_dir, j, ))
    #     p.start()
    #     p.join()

    # if get_platform() == "Windows":
    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/11111111.rar"]
    #     file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
    #                       "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"]
    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc"]
    #
    # else:
    #     file_path_list = ["test1.pdf"]
    # for j in range(10):
    #     p = Process(target=test_duplicate, args=(file_path_list, j, ))
    #     p.start()
    #     p.join()