convert.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905
  1. #-*- coding: utf-8 -*-
  2. import gc
  3. import json
  4. import sys
  5. import os
  6. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  7. # 强制tf使用cpu
  8. os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
  9. from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
  10. set_flask_global, get_md5_from_bytes, memory_decorator
  11. from format_convert.convert_doc import doc2text, DocConvert
  12. from format_convert.convert_docx import docx2text, DocxConvert
  13. from format_convert.convert_image import picture2text, ImageConvert
  14. from format_convert.convert_pdf import pdf2text, PDFConvert
  15. from format_convert.convert_rar import rar2text, RarConvert
  16. from format_convert.convert_swf import swf2text, SwfConvert
  17. from format_convert.convert_txt import txt2text, TxtConvert
  18. from format_convert.convert_xls import xls2text, XlsConvert
  19. from format_convert.convert_xlsx import xlsx2text, XlsxConvert
  20. from format_convert.convert_zip import zip2text, ZipConvert
  21. from format_convert.convert_need_interface import from_atc_interface
  22. import hashlib
  23. from format_convert.judge_platform import get_platform
  24. from ocr import ocr_interface
  25. from otr import otr_interface
  26. import re
  27. import shutil
  28. import base64
  29. import time
  30. import uuid
  31. import logging
  32. from bs4 import BeautifulSoup
  33. from flask import Flask, request, g
  34. import inspect
  35. logging.getLogger("pdfminer").setLevel(logging.WARNING)
  36. from format_convert.table_correct import *
  37. from format_convert.wrapt_timeout_decorator import *
  38. from format_convert import _global
  39. from format_convert.max_compute_config import max_compute
# Flag from config: when truthy we are running inside MaxCompute/UDF and the
# converters are called directly, skipping the timeout wrappers.
MAX_COMPUTE = max_compute

# Per-platform default conversion timeout (seconds), stored in module globals;
# Windows gets a longer limit for local debugging.
if get_platform() == "Windows":
    globals().update({"time_out": 1000})
else:
    globals().update({"time_out": 300})
  45. @memory_decorator
  46. def getText(_type, path_or_stream, time_out=300):
  47. @timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
  48. def get_html_1(_class):
  49. return _class.get_html()
  50. @timeout(600, timeout_exception=TimeoutError, use_signals=False)
  51. def get_html_2(_class):
  52. return _class.get_html()
  53. log("file type - " + _type)
  54. try:
  55. ss = path_or_stream.split(".")
  56. unique_type_dir = ss[-2] + "_" + ss[-1] + os.sep
  57. except:
  58. unique_type_dir = path_or_stream + "_" + _type + os.sep
  59. if _type == "pdf":
  60. if MAX_COMPUTE:
  61. return PDFConvert(path_or_stream, unique_type_dir).get_html()
  62. return get_html_1(PDFConvert(path_or_stream, unique_type_dir))
  63. if _type == "docx":
  64. if MAX_COMPUTE:
  65. return DocxConvert(path_or_stream, unique_type_dir).get_html()
  66. return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
  67. if _type == "zip":
  68. return ZipConvert(path_or_stream, unique_type_dir).get_html()
  69. # return get_html_2(ZipConvert(path_or_stream, unique_type_dir))
  70. if _type == "rar":
  71. return RarConvert(path_or_stream, unique_type_dir).get_html()
  72. # return get_html_2(RarConvert(path_or_stream, unique_type_dir))
  73. if _type == "xlsx":
  74. if MAX_COMPUTE:
  75. return XlsxConvert(path_or_stream, unique_type_dir).get_html()
  76. return get_html_1(XlsxConvert(path_or_stream, unique_type_dir))
  77. if _type == "xls":
  78. if MAX_COMPUTE:
  79. return XlsConvert(path_or_stream, unique_type_dir).get_html()
  80. return get_html_1(XlsConvert(path_or_stream, unique_type_dir))
  81. if _type == "doc":
  82. if MAX_COMPUTE:
  83. return DocConvert(path_or_stream, unique_type_dir).get_html()
  84. return get_html_1(DocConvert(path_or_stream, unique_type_dir))
  85. if _type == "jpg" or _type == "png" or _type == "jpeg":
  86. if MAX_COMPUTE:
  87. return ImageConvert(path_or_stream, unique_type_dir).get_html()
  88. return get_html_1(ImageConvert(path_or_stream, unique_type_dir))
  89. if _type == "swf":
  90. if MAX_COMPUTE:
  91. return SwfConvert(path_or_stream, unique_type_dir).get_html()
  92. return get_html_1(SwfConvert(path_or_stream, unique_type_dir))
  93. if _type == "txt":
  94. if MAX_COMPUTE:
  95. return TxtConvert(path_or_stream, unique_type_dir).get_html()
  96. return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
  97. return [""]
  98. def to_html(path, text):
  99. with open(path, 'w',encoding="utf8") as f:
  100. f.write("<!DOCTYPE HTML>")
  101. f.write('<head><meta charset="UTF-8"></head>')
  102. f.write("<body>")
  103. f.write(text)
  104. f.write("</body>")
def remove_underline(image_np):
    """
    Remove text underlines from an image (debug/experimental).

    Detects horizontal strokes via a Sobel row filter plus wide-kernel
    erode/dilate, displaying each intermediate image; currently returns None
    without modifying the input.

    :param image_np: BGR image as a numpy array (OpenCV convention) —
        assumed 3-channel, TODO confirm with callers.
    """
    # Grayscale
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    # Binarize; input is inverted (~gray) so dark strokes become foreground.
    binary = cv2.adaptiveThreshold(~gray, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
                                   15, 10)
    # Sobel kernels: row kernel responds to horizontal edges, col to vertical.
    kernel_row = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], np.float32)
    kernel_col = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], np.float32)
    # binary = cv2.filter2D(binary, -1, kernel=kernel)
    binary_row = cv2.filter2D(binary, -1, kernel=kernel_row)
    binary_col = cv2.filter2D(binary, -1, kernel=kernel_col)
    # NOTE(review): imshow/waitKey block until a key press — debug only; do
    # not call this from server code.
    cv2.imshow("custom_blur_demo", binary)
    cv2.waitKey(0)
    rows, cols = binary.shape
    # Detect horizontal lines: erode then dilate with a wide, 1-px-high kernel.
    scale = 5
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (cols // scale, 1))
    erodedcol = cv2.erode(binary_row, kernel, iterations=1)
    cv2.imshow("Eroded Image", erodedcol)
    cv2.waitKey(0)
    dilatedcol = cv2.dilate(erodedcol, kernel, iterations=1)
    cv2.imshow("dilate Image", dilatedcol)
    cv2.waitKey(0)
    return
# @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
# @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
def unique_temp_file_process(stream, _type, _md5, time_out=300):
    """Write *stream* into a unique temp workspace, convert it, then clean up.

    :param stream: raw file bytes to convert.
    :param _type: file extension used to pick the converter.
    :param _md5: md5 of the stream, stored in globals/_global for logging.
    :param time_out: timeout (seconds) forwarded to getText.
    :return: (text, swf_images) — text is the converter result list
             ([-5] on timeout, [-1] on any other failure); swf_images is a
             list of base64-encoded PNG frames when _type == "swf".
    """
    if get_platform() == "Windows":
        _global._init()
        globals().update({"md5": _md5})
    _global.update({"md5": _md5})
    log("into unique_temp_file_process")
    try:
        # Each call gets a unique workspace under <_path>/temp.
        uid1 = uuid.uuid1().hex
        unique_space_path = _path + os.sep + "temp" + os.sep + uid1 + os.sep
        # unique_space_path = "/mnt/fangjiasheng/" + "temp/" + uid1 + "/"
        # Collision check
        if not os.path.exists(unique_space_path):
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(unique_space_path)
        else:
            # NOTE(review): on collision a second dir (uid2) is created, but
            # file_path below still targets the colliding uid1 path — looks
            # unintended; confirm before relying on this branch.
            uid2 = uuid.uuid1().hex
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(_path + os.sep + "temp" + os.sep + uid2 + os.sep)
            # os.mkdir("/mnt/" + "temp/" + uid2 + "/")
        # Save the incoming bytes under a unique name inside the workspace.
        uid3 = uuid.uuid1().hex
        file_path = unique_space_path + uid3 + "." + _type
        with open(file_path, "wb") as ff:
            ff.write(stream)
        text = getText(_type, file_path, time_out=time_out)
        # Collect PNG frames produced by swf conversion (skip "resize" copies).
        swf_images = []
        if _type == "swf":
            image_name_list = []
            for root, dirs, files in os.walk(unique_space_path, topdown=False):
                for name in files:
                    if name[-4:] == ".png" and "resize" not in name:
                        image_name_list.append(name)
            image_name_list.sort(key=lambda x: x)
            for name in image_name_list:
                with open(os.path.join(unique_space_path, name), "rb") as f:
                    img_bytes = f.read()
                    swf_images.append(base64.b64encode(img_bytes))
            log("unique_temp_file_process len(swf_images) " + str(len(swf_images)))
        return text, swf_images
    except TimeoutError:
        return [-5], []
    except Exception as e:
        log("unique_temp_file_process failed!")
        traceback.print_exc()
        return [-1], []
    finally:
        print("======================================")
        try:
            if get_platform() == "Linux":
                # log("not delete temp file")
                # Remove the whole workspace (Linux only; kept on Windows for
                # debugging).
                if os.path.exists(unique_space_path):
                    shutil.rmtree(unique_space_path)
        except Exception as e:
            log("Delete Files Failed!")
  195. def cut_str(text_list, only_text_list, max_bytes_length=2000000):
  196. log("into cut_str")
  197. try:
  198. # 计算有格式总字节数
  199. bytes_length = 0
  200. for text in text_list:
  201. bytes_length += len(bytes(text, encoding='utf-8'))
  202. # print("text_list", bytes_length)
  203. # 小于直接返回
  204. if bytes_length < max_bytes_length:
  205. print("return text_list no cut")
  206. return text_list
  207. # 全部文件连接,重新计算无格式字节数
  208. all_text = ""
  209. bytes_length = 0
  210. for text in only_text_list:
  211. bytes_length += len(bytes(text, encoding='utf-8'))
  212. all_text += text
  213. # print("only_text_list", bytes_length)
  214. # 小于直接返回
  215. if bytes_length < max_bytes_length:
  216. print("return only_text_list no cut")
  217. return only_text_list
  218. # 截取字符
  219. all_text = all_text[:int(max_bytes_length/3)]
  220. # print("text bytes ", len(bytes(all_text, encoding='utf-8')))
  221. # print("return only_text_list has cut")
  222. return [all_text]
  223. except Exception as e:
  224. log("cut_str " + str(e))
  225. return ["-1"]
  226. @memory_decorator
  227. def convert_maxcompute(data, ocr_model, otr_model):
  228. """
  229. 接口返回值:
  230. {[str], 1}: 处理成功
  231. {[-1], 0}: 逻辑处理错误
  232. {[-2], 0}: 接口调用错误
  233. {[-3], 1}: 文件格式错误,无法打开
  234. {[-4], 0}: 各类文件调用第三方包读取超时
  235. {[-5], 0}: 整个转换过程超时
  236. {[-6], 0}: 阿里云UDF队列超时
  237. {[-7], 1}: 文件需密码,无法打开
  238. :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
  239. """
  240. # 控制内存
  241. # soft, hard = resource.getrlimit(resource.RLIMIT_AS)
  242. # resource.setrlimit(resource.RLIMIT_AS, (15 * 1024 ** 3, hard))
  243. log("into convert")
  244. start_time = time.time()
  245. _md5 = "1000000"
  246. try:
  247. # 模型加入全局变量
  248. globals().update({"global_ocr_model": ocr_model})
  249. globals().update({"global_otr_model": otr_model})
  250. stream = base64.b64decode(data.get("file"))
  251. _type = data.get("type")
  252. _md5 = get_md5_from_bytes(stream)
  253. if get_platform() == "Windows":
  254. # 解除超时装饰器,直接访问原函数
  255. origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
  256. text, swf_images = origin_unique_temp_file_process(stream, _type, _md5)
  257. else:
  258. # Linux 通过装饰器设置整个转换超时时间
  259. try:
  260. text, swf_images = unique_temp_file_process(stream, _type, _md5)
  261. except TimeoutError:
  262. log("convert time out! 1200 sec")
  263. text = [-5]
  264. swf_images = []
  265. error_code = [[-x] for x in range(1, 9)]
  266. still_success_code = [[-3], [-7]]
  267. if text in error_code:
  268. if text in still_success_code:
  269. print({"failed result": text, "is_success": 1}, time.time() - start_time)
  270. return {"result_html": [str(text[0])], "result_text": [str(text[0])],
  271. "is_success": 1}
  272. else:
  273. print({"failed result": text, "is_success": 0}, time.time() - start_time)
  274. return {"result_html": [str(text[0])], "result_text": [str(text[0])],
  275. "is_success": 0}
  276. # 结果保存result.html
  277. if get_platform() == "Windows":
  278. text_str = ""
  279. for t in text:
  280. text_str += t
  281. to_html("../result.html", text_str)
  282. # 取纯文本
  283. only_text = []
  284. for t in text:
  285. new_t = BeautifulSoup(t, "lxml").get_text()
  286. new_t = re.sub("\n", "", new_t)
  287. only_text.append(new_t)
  288. # 判断长度,过长截取
  289. text = cut_str(text, only_text)
  290. only_text = cut_str(only_text, only_text)
  291. if len(only_text) == 0:
  292. only_text = [""]
  293. if only_text[0] == '' and len(only_text) <= 1:
  294. print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
  295. else:
  296. print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
  297. "is_success": 1}, time.time() - start_time)
  298. return {"result_html": text, "result_text": only_text, "is_success": 1}
  299. except Exception as e:
  300. print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
  301. print("convert", traceback.print_exc())
  302. return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
# Interface configuration
app = Flask(__name__)


@app.route('/convert', methods=['POST'])
def _convert():
    """
    Flask endpoint: convert an uploaded (base64 or path-referenced) file to
    HTML and plain text, plus an attachment classification.

    Return codes (result list / is_success):
    {[str], 1}: success
    {[-1], 0}: logic error
    {[-2], 0}: interface call error
    {[-3], 1}: unreadable file format
    {[-4], 0}: third-party reader timeout
    {[-5], 0}: whole-conversion timeout
    {[-6], 0}: Aliyun UDF queue timeout
    {[-7], 1}: password-protected file
    :return: {"result_html": str([]), "result_text": str([]) "is_success": int}
    """
    # log("growth start" + str(objgraph.growth()))
    # log("most_common_types start" + str(objgraph.most_common_types(20)))
    # tracemalloc.start(25)
    # snapshot = tracemalloc.take_snapshot()
    _global._init()
    _global.update({"md5": "1"+"0"*15})
    set_flask_global()
    # _global.update({"port": str(port)})
    log("into convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        data = request.form
        if not data:
            log("convert no data!")
            raise ConnectionError
        file_path = data.get("file_path")
        if file_path is None:
            stream = base64.b64decode(data.get("file"))
            log("get bytes from file " + str(time.time()-_time))
        # A path was supplied: read the file directly.
        else:
            with open(file_path, "rb") as f:
                stream = f.read()
            log("get bytes from file_path " + str(time.time()-_time))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        if get_platform() == "Windows":
            # Bypass the timeout decorator, call the original directly:
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # On Linux the decorator enforces the overall conversion timeout.
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success) + " "
                + str(_type) + " "
                + " " + str(time.time() - start_time))
            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                               "is_success": is_success, "swf_images": str(swf_images)})
        # Save the result as result.html
        # if get_platform() == "Windows":
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Plain-text extraction
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment type
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Truncate over-long results
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + " finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(classification) + " "
                + str(time.time() - start_time))
        # log("growth end" + str(objgraph.growth()))
        # log("most_common_types end" + str(objgraph.most_common_types(20)))
        return json.dumps({"result_html": text, "result_text": only_text,
                           "is_success": 1, "swf_images": str(swf_images),
                           "classification": classification})
    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(time.time() - start_time))
        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    except Exception as e:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    finally:
        # _global._del()
        # gc.collect()
        log("finally")
        # Disabled memory diagnostics (tracemalloc / objgraph dumps):
        # snapshot1 = tracemalloc.take_snapshot()
        # top_stats = snapshot1.compare_to(snapshot, 'lineno')
        # log("[ Top 20 differences ]")
        # for stat in top_stats[:20]:
        #     if stat.size_diff < 0:
        #         continue
        #     log(stat)
        # gth = objgraph.growth(limit=10)
        # for gt in gth:
        #     log("growth type:%s, count:%s, growth:%s" % (gt[0], gt[1], gt[2]))
        #     # if gt[2] > 100 or gt[1] > 300:
        #     #     continue
        #     if gt[2] < 5:
        #         continue
        #     _p = os.path.dirname(os.path.abspath(__file__))
        #     objgraph.show_backrefs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        #                            filename=_p + "/dots/%s_%s_backrefs.dot" % (_md5, gt[0]))
        #     objgraph.show_refs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        #                        filename=_p + "/dots/%s_%s_refs.dot" % (_md5, gt[0]))
        #     objgraph.show_chain(
        #         objgraph.find_backref_chain(objgraph.by_type(gt[0])[0], objgraph.is_proper_module),
        #         filename=_p + "/dots/%s_%s_chain.dot" % (_md5, gt[0])
        #     )
def convert(data):
    """
    Convert a base64-encoded file to HTML and plain text (in-process entry,
    same flow as the Flask endpoint but returning a dict instead of JSON).

    Return codes (result list / is_success):
    {[str], 1}: success
    {[-1], 0}: logic error
    {[-2], 0}: interface call error
    {[-3], 1}: unreadable file format
    {[-4], 0}: third-party reader timeout
    {[-5], 0}: whole-conversion timeout
    {[-6], 0}: Aliyun UDF queue timeout
    {[-7], 1}: password-protected file
    :param data: dict with "file" (base64 str/bytes) and "type" (extension).
    :return: {"result_html": str([]), "result_text": str([]) "is_success": int}
    """
    _global._init()
    _global.update({"md5": "1"+"0"*15})
    set_flask_global()
    log("into convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        # Put the models into module globals (disabled here):
        # globals().update({"global_ocr_model": ocr_model})
        # globals().update({"global_otr_model": otr_model})
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        if get_platform() == "Windows":
            # Bypass the timeout decorator, call the original directly:
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # On Linux the decorator enforces the overall conversion timeout.
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success) + " "
                + str(_type) + " "
                + " " + str(time.time() - start_time))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}
        # Save the result as result.html (skipped on MaxCompute)
        if not MAX_COMPUTE:
            text_str = ""
            for t in text:
                text_str += t
            to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Plain-text extraction
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment type
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Truncate over-long results
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + " finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(classification) + " "
                + str(time.time() - start_time))
        return {"result_html": text, "result_text": only_text,
                "is_success": 1, "swf_images": str(swf_images),
                "classification": classification}
    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(time.time() - start_time))
        return {"result_html": ["-2"], "result_text": ["-2"],
                "is_success": 0, "swf_images": str([]),
                "classification": ""}
    except Exception as e:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"],
                "is_success": 0, "swf_images": str([]),
                "classification": ""}
    finally:
        log("finally")
def convert_old(data, ocr_model, otr_model):
    """
    Legacy conversion entry point taking pre-loaded OCR/OTR model handles.

    Return codes (result list / is_success):
    {[str], 1}: success
    {[-1], 0}: logic error
    {[-2], 0}: interface call error
    {[-3], 1}: unreadable file format
    {[-4], 0}: third-party reader timeout
    {[-5], 0}: whole-conversion timeout
    {[-6], 0}: Aliyun UDF queue timeout
    {[-7], 1}: password-protected file
    :return: {"result_html": str([]), "result_text": str([]) "is_success": int}
    """
    log("into convert")
    _global._init()
    _global.update({"md5": "1"+"0"*15})
    # set_flask_global()
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        # Expose the models to the converters via module globals.
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})
        _time = time.time()
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        log("get bytes from file " + str(time.time()-_time))
        if get_platform() == "Windows":
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # Linux: the decorator enforces the timeout; note the much longer
            # 3000 s limit used by this legacy path.
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, time_out=3000)
            except TimeoutError:
                # NOTE(review): message says 300 sec but time_out above is 3000.
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success) + " "
                + str(_type) + " "
                + " " + str(time.time() - start_time))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}
        # Save the result as result.html
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Plain-text extraction
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Truncate over-long results
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + " finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        return {"result_html": text, "result_text": only_text,
                "is_success": 1, "swf_images": str(swf_images)}
    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(time.time() - start_time))
        return {"result_html": ["-2"], "result_text": ["-2"],
                "is_success": 0, "swf_images": str([])}
    except Exception as e:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"],
                "is_success": 0, "swf_images": str([])}
    finally:
        log("finally")
  678. def test_more(_dir, process_no=None):
  679. file_path_list = []
  680. for root, dirs, files in os.walk(_dir, topdown=False):
  681. for name in files:
  682. file_path_list.append(os.path.join(root, name))
  683. start_time = time.time()
  684. i = 0
  685. for p in file_path_list:
  686. if i % 10 == 0:
  687. if process_no is not None:
  688. print("Process", process_no, i, time.time()-start_time)
  689. else:
  690. print("Loop", i, time.time()-start_time)
  691. test_one(p, from_remote=True)
  692. i += 1
  693. def test_one(p, from_remote=False):
  694. with open(p, "rb") as f:
  695. file_bytes = f.read()
  696. file_base64 = base64.b64encode(file_bytes)
  697. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
  698. if from_remote:
  699. ocr_model = None
  700. otr_model = None
  701. _url = 'http://121.46.18.113:15010/convert'
  702. # _url = 'http://192.168.2.102:15010/convert'
  703. # _url = 'http://172.16.160.65:15010/convert'
  704. result = json.loads(request_post(_url, data, time_out=10000))
  705. with open("../result.html", "w") as f:
  706. f.write(result.get("result_text")[0])
  707. if p.split(".")[-1] == "swf":
  708. swf_images = eval(result.get("swf_images"))
  709. print(type(swf_images))
  710. # for img in swf_images:
  711. # img_bytes = base64.b64decode(img)
  712. # img = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
  713. # cv2.imshow("swf_images", img)
  714. # cv2.waitKey(0)
  715. else:
  716. ocr_model = ocr_interface.OcrModels().get_model()
  717. otr_model = otr_interface.OtrModels().get_model()
  718. result = convert_maxcompute(data, ocr_model, otr_model)
  719. print("result_text", result.get("result_text")[0][:20])
  720. print("is_success", result.get("is_success"))
  721. def test_duplicate(path_list, process_no=None):
  722. start_time = time.time()
  723. for i in range(500):
  724. if i % 10 == 0:
  725. if process_no is not None:
  726. print("Process", process_no, i*len(path_list), time.time()-start_time)
  727. else:
  728. print("Loop", i*len(path_list), time.time()-start_time)
  729. for p in path_list:
  730. test_one(p, from_remote=True)
# Module-level state shared by the helpers above.
global_type = ""
local_url = "http://127.0.0.1"
# Root directory for temp workspaces: the package dir on Windows, /home/admin
# on Linux — falling back to the package dir when /home/admin does not exist.
if get_platform() == "Windows":
    _path = os.path.abspath(os.path.dirname(__file__))
else:
    _path = "/home/admin"
if not os.path.exists(_path):
    _path = os.path.dirname(os.path.abspath(__file__))
if __name__ == '__main__':
    # convert interface: optional single CLI argument overrides the port.
    if len(sys.argv) == 2:
        port = int(sys.argv[1])
    else:
        port = 15010
    globals().update({"md5": "1"+"0"*15})
    globals().update({"port": str(port)})
    # _global._init()
    # _global.update({"md5": "1"+"0"*15})
    # _global.update({"port": str(port)})
    # ip = get_intranet_ip()
    # log("my ip"+str(ip))
    # ip = "http://" + ip
    ip_port_dict = get_ip_port()
    ip = "http://127.0.0.1"
    processes = ip_port_dict.get(ip).get("convert_processes")
    set_flask_global()
    if get_platform() == "Windows":
        app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
    else:
        # NOTE(review): the computed `port` and `processes` are not used on
        # this branch — the server is pinned to 15011; confirm intentional.
        # app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
        app.run(port=15011)
    # Disabled manual test drivers below (single file / directory / duplicate
    # stress runs):
    # if get_platform() == "Windows":
    #     file_path = "C:/Users/Administrator/Desktop/test_image/error29.png"
    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
    #     # file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
    # else:
    #     file_path = "test1.doc"
    # test_one(file_path, from_remote=True)
    # if get_platform() == "Windows":
    #     file_dir = "D:/BIDI_DOC/比地_文档/table_images/"
    # else:
    #     file_dir = "../table_images/"
    #
    # for j in range(10):
    #     p = Process(target=test_more, args=(file_dir, j, ))
    #     p.start()
    # p.join()
    # if get_platform() == "Windows":
    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/11111111.rar"]
    #     file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
    #                       "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"]
    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc"]
    #
    # else:
    #     file_path_list = ["test1.pdf"]
    # for j in range(10):
    #     p = Process(target=test_duplicate, args=(file_path_list, j, ))
    #     p.start()
    # p.join()