# convert.py
  1. #-*- coding: utf-8 -*-
  2. import gc
  3. import json
  4. import sys
  5. import os
  6. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  7. # 强制tf使用cpu
  8. os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
  9. from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
  10. set_flask_global, get_md5_from_bytes, memory_decorator
  11. from format_convert.convert_doc import doc2text, DocConvert
  12. from format_convert.convert_docx import docx2text, DocxConvert
  13. from format_convert.convert_image import picture2text, ImageConvert
  14. from format_convert.convert_pdf import pdf2text, PDFConvert
  15. from format_convert.convert_rar import rar2text, RarConvert
  16. from format_convert.convert_swf import swf2text, SwfConvert
  17. from format_convert.convert_txt import txt2text, TxtConvert
  18. from format_convert.convert_xls import xls2text, XlsConvert
  19. from format_convert.convert_xlsx import xlsx2text, XlsxConvert
  20. from format_convert.convert_zip import zip2text, ZipConvert
  21. from format_convert.convert_need_interface import from_atc_interface
  22. import hashlib
  23. from format_convert.judge_platform import get_platform
  24. from ocr import ocr_interface
  25. from otr import otr_interface
  26. import re
  27. import shutil
  28. import base64
  29. import time
  30. import uuid
  31. import logging
  32. from bs4 import BeautifulSoup
  33. from flask import Flask, request, g
  34. import inspect
  35. logging.getLogger("pdfminer").setLevel(logging.WARNING)
  36. from format_convert.table_correct import *
  37. from format_convert.wrapt_timeout_decorator import *
  38. from format_convert import _global
  39. from config.max_compute_config import MAX_COMPUTE
  40. if get_platform() == "Windows":
  41. globals().update({"time_out": 1000})
  42. else:
  43. globals().update({"time_out": 300})
@memory_decorator
def getText(_type, path_or_stream, _page_no=None, time_out=300):
    """Dispatch a file to the converter matching its extension and return HTML.

    :param _type: lowercase file extension, e.g. "pdf", "docx", "zip".
    :param path_or_stream: path of the file on disk (despite the name, only a
        path is handled here; the split below assumes "<stem>.<ext>").
    :param _page_no: optional page range forwarded to converters that honor it
        (pdf, zip, rar).
    :param time_out: seconds allowed for a single get_html() call.
    :return: list of HTML strings from the converter, or [""] for an
        unrecognized type.
    """
    @timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
    def get_html_1(_class):
        # Standard per-file timeout wrapper.
        return _class.get_html()

    @timeout(600, timeout_exception=TimeoutError, use_signals=False)
    def get_html_2(_class):
        # Longer fixed timeout, kept for the (currently disabled) archive path.
        return _class.get_html()

    log("file type - " + _type + ' page - ' + str(_page_no) + ' time out - ' + str(time_out))
    try:
        # Working-directory name derived from the file name: "<stem>_<ext>/".
        ss = path_or_stream.split(".")
        unique_type_dir = ss[-2] + "_" + ss[-1] + os.sep
    except:
        unique_type_dir = path_or_stream + "_" + _type + os.sep
    if _type == "pdf":
        if MAX_COMPUTE:
            # On MaxCompute run without the timeout decorator.
            return PDFConvert(path_or_stream, unique_type_dir, _page_no).get_html()
        return get_html_1(PDFConvert(path_or_stream, unique_type_dir, _page_no))
    if _type == "docx":
        if MAX_COMPUTE:
            return DocxConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
    if _type == "zip":
        # Archives manage their own per-member timeouts.
        return ZipConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
        # return get_html_2(ZipConvert(path_or_stream, unique_type_dir))
    if _type == "rar":
        return RarConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
        # return get_html_2(RarConvert(path_or_stream, unique_type_dir))
    if _type == "xlsx":
        if MAX_COMPUTE:
            return XlsxConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(XlsxConvert(path_or_stream, unique_type_dir))
    if _type == "xls":
        if MAX_COMPUTE:
            return XlsConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(XlsConvert(path_or_stream, unique_type_dir))
    if _type == "doc":
        if MAX_COMPUTE:
            return DocConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(DocConvert(path_or_stream, unique_type_dir))
    if _type == "jpg" or _type == "png" or _type == "jpeg":
        if MAX_COMPUTE:
            return ImageConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(ImageConvert(path_or_stream, unique_type_dir))
    if _type == "swf":
        if MAX_COMPUTE:
            return SwfConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(SwfConvert(path_or_stream, unique_type_dir))
    if _type == "txt":
        if MAX_COMPUTE:
            return TxtConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
    return [""]
  97. def to_html(path, text):
  98. with open(path, 'w',encoding="utf8") as f:
  99. f.write("<!DOCTYPE HTML>")
  100. f.write('<head><meta charset="UTF-8"></head>')
  101. f.write("<body>")
  102. f.write(text)
  103. f.write("</body>")
def remove_underline(image_np):
    """Remove text underlines from an image (experimental / debugging only).

    NOTE(review): this function opens cv2.imshow windows and blocks on
    waitKey, and always returns None -- it looks like an unfinished
    prototype; confirm it is not called from production paths.

    :param image_np: BGR image as a numpy array (assumed 3-channel -- the
        cvtColor below requires it; TODO confirm).
    """
    # Grayscale conversion.
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    # Adaptive binarization on the inverted image.
    binary = cv2.adaptiveThreshold(~gray, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
                                   15, 10)
    # Sobel-like kernels for horizontal / vertical edges.
    kernel_row = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], np.float32)
    kernel_col = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], np.float32)
    # binary = cv2.filter2D(binary, -1, kernel=kernel)
    binary_row = cv2.filter2D(binary, -1, kernel=kernel_row)
    binary_col = cv2.filter2D(binary, -1, kernel=kernel_col)
    cv2.imshow("custom_blur_demo", binary)
    cv2.waitKey(0)
    rows, cols = binary.shape
    # Detect horizontal lines via morphological erode + dilate with a wide,
    # 1-pixel-high structuring element.
    scale = 5
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (cols // scale, 1))
    erodedcol = cv2.erode(binary_row, kernel, iterations=1)
    cv2.imshow("Eroded Image", erodedcol)
    cv2.waitKey(0)
    dilatedcol = cv2.dilate(erodedcol, kernel, iterations=1)
    cv2.imshow("dilate Image", dilatedcol)
    cv2.waitKey(0)
    return
  133. # @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
  134. # @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
  135. def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300, save_middle=None):
  136. if get_platform() == "Windows":
  137. _global._init()
  138. globals().update({"md5": _md5})
  139. _global.update({"md5": _md5})
  140. log("into unique_temp_file_process")
  141. try:
  142. # 每个调用在temp中创建一个唯一空间
  143. uid1 = uuid.uuid1().hex
  144. unique_space_path = _path + os.sep + "temp" + os.sep + uid1 + os.sep
  145. # unique_space_path = "/mnt/fangjiasheng/" + "temp/" + uid1 + "/"
  146. # 判断冲突
  147. if not os.path.exists(unique_space_path):
  148. if not os.path.exists(_path + os.sep + "temp"):
  149. os.mkdir(_path + os.sep + "temp" + os.sep)
  150. os.mkdir(unique_space_path)
  151. else:
  152. uid2 = uuid.uuid1().hex
  153. if not os.path.exists(_path + os.sep + "temp"):
  154. os.mkdir(_path + os.sep + "temp" + os.sep)
  155. os.mkdir(_path + os.sep + "temp" + os.sep + uid2 + os.sep)
  156. # os.mkdir("/mnt/" + "temp/" + uid2 + "/")
  157. # 在唯一空间中,对传入的文件也保存为唯一
  158. uid3 = uuid.uuid1().hex
  159. file_path = unique_space_path + uid3 + "." + _type
  160. with open(file_path, "wb") as ff:
  161. ff.write(stream)
  162. text = getText(_type, file_path, _page_no, time_out=time_out)
  163. # 获取swf转换的图片
  164. swf_images = []
  165. if _type == "swf":
  166. image_name_list = []
  167. for root, dirs, files in os.walk(unique_space_path, topdown=False):
  168. for name in files:
  169. if name[-4:] == ".png" and "resize" not in name:
  170. image_name_list.append(name)
  171. image_name_list.sort(key=lambda x: x)
  172. for name in image_name_list:
  173. with open(os.path.join(unique_space_path, name), "rb") as f:
  174. img_bytes = f.read()
  175. swf_images.append(base64.b64encode(img_bytes))
  176. log("unique_temp_file_process len(swf_images) " + str(len(swf_images)))
  177. return text, swf_images
  178. except TimeoutError:
  179. return [-5], []
  180. except Exception as e:
  181. log("unique_temp_file_process failed!")
  182. traceback.print_exc()
  183. return [-1], []
  184. finally:
  185. print("======================================")
  186. try:
  187. if get_platform() == "Linux" and save_middle is None:
  188. # log("not delete temp file")
  189. # 删除该唯一空间下所有文件
  190. if os.path.exists(unique_space_path):
  191. shutil.rmtree(unique_space_path)
  192. except Exception as e:
  193. log("Delete Files Failed!")
  194. def cut_str(text_list, only_text_list, max_bytes_length=2000000):
  195. log("into cut_str")
  196. try:
  197. if max_bytes_length and str(max_bytes_length) == '-1':
  198. max_bytes_length = 2000000000000
  199. else:
  200. max_bytes_length = 2000000
  201. # 计算有格式总字节数
  202. bytes_length = 0
  203. for text in text_list:
  204. bytes_length += len(bytes(text, encoding='utf-8'))
  205. # 小于直接返回
  206. if bytes_length < max_bytes_length:
  207. # print("return text_list no cut")
  208. return text_list
  209. # 全部文件连接,重新计算无格式字节数
  210. all_text = ""
  211. bytes_length = 0
  212. for text in only_text_list:
  213. bytes_length += len(bytes(text, encoding='utf-8'))
  214. all_text += text
  215. # 小于直接返回
  216. if bytes_length < max_bytes_length:
  217. print("return only_text_list no cut")
  218. return only_text_list
  219. # 截取字符
  220. all_text = all_text[:int(max_bytes_length/3)]
  221. return [all_text]
  222. except Exception as e:
  223. log("cut_str " + str(e))
  224. return ["-1"]
  225. @memory_decorator
  226. def convert_maxcompute(data, ocr_model, otr_model):
  227. """
  228. 接口返回值:
  229. {[str], 1}: 处理成功
  230. {[-1], 0}: 逻辑处理错误
  231. {[-2], 0}: 接口调用错误
  232. {[-3], 1}: 文件格式错误,无法打开
  233. {[-4], 0}: 各类文件调用第三方包读取超时
  234. {[-5], 0}: 整个转换过程超时
  235. {[-6], 0}: 阿里云UDF队列超时
  236. {[-7], 1}: 文件需密码,无法打开
  237. :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
  238. """
  239. # 控制内存
  240. # soft, hard = resource.getrlimit(resource.RLIMIT_AS)
  241. # resource.setrlimit(resource.RLIMIT_AS, (15 * 1024 ** 3, hard))
  242. log("into convert")
  243. start_time = time.time()
  244. _md5 = "1000000"
  245. try:
  246. # 模型加入全局变量
  247. globals().update({"global_ocr_model": ocr_model})
  248. globals().update({"global_otr_model": otr_model})
  249. stream = base64.b64decode(data.get("file"))
  250. _type = data.get("type")
  251. _md5 = get_md5_from_bytes(stream)
  252. if get_platform() == "Windows":
  253. # 解除超时装饰器,直接访问原函数
  254. origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
  255. text, swf_images = origin_unique_temp_file_process(stream, _type, _md5)
  256. else:
  257. # Linux 通过装饰器设置整个转换超时时间
  258. try:
  259. text, swf_images = unique_temp_file_process(stream, _type, _md5)
  260. except TimeoutError:
  261. log("convert time out! 1200 sec")
  262. text = [-5]
  263. swf_images = []
  264. error_code = [[-x] for x in range(1, 9)]
  265. still_success_code = [[-3], [-7]]
  266. if text in error_code:
  267. if text in still_success_code:
  268. print({"failed result": text, "is_success": 1}, time.time() - start_time)
  269. return {"result_html": [str(text[0])], "result_text": [str(text[0])],
  270. "is_success": 1}
  271. else:
  272. print({"failed result": text, "is_success": 0}, time.time() - start_time)
  273. return {"result_html": [str(text[0])], "result_text": [str(text[0])],
  274. "is_success": 0}
  275. # 结果保存result.html
  276. if get_platform() == "Windows":
  277. text_str = ""
  278. for t in text:
  279. text_str += t
  280. to_html("../result.html", text_str)
  281. # 取纯文本
  282. only_text = []
  283. for t in text:
  284. new_t = BeautifulSoup(t, "lxml").get_text()
  285. new_t = re.sub("\n", "", new_t)
  286. only_text.append(new_t)
  287. # 判断长度,过长截取
  288. text = cut_str(text, only_text)
  289. only_text = cut_str(only_text, only_text)
  290. if len(only_text) == 0:
  291. only_text = [""]
  292. if only_text[0] == '' and len(only_text) <= 1:
  293. print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
  294. else:
  295. print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
  296. "is_success": 1}, time.time() - start_time)
  297. return {"result_html": text, "result_text": only_text, "is_success": 1}
  298. except Exception as e:
  299. print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
  300. print("convert", traceback.print_exc())
  301. return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
# Interface wiring
app = Flask(__name__)


@app.route('/convert', methods=['POST'])
def _convert():
    """HTTP entry point: convert a posted file to HTML plus plain text.

    Return codes:
    {[str], 1}: success
    {[-1], 0}: internal logic error
    {[-2], 0}: interface call error (empty POST data)
    {[-3], 1}: unreadable / wrong file format
    {[-4], 0}: third-party reader timeout
    {[-5], 0}: whole-conversion timeout
    {[-6], 0}: Aliyun UDF queue timeout
    {[-7], 1}: password-protected file
    {[-8], 0}: error calling an existing interface
    {[-9], 0}: interface received empty data
    {[-10], 0}: long-image split error
    {[-11], 0}: idc/isr/atc interface error
    {[-12], 0}: cross-page table join error
    {[-13], 0}: pdf table-line processing error
    {[-14], 0}: page-number selection error
    {[-15], 0}: office conversion service not running
    :return: JSON string {"result_html": [...], "result_text": [...],
        "is_success": int, "swf_images": str, "classification": ...}
    """
    # (objgraph / tracemalloc profiling scaffolding intentionally disabled)
    # log("growth start" + str(objgraph.growth()))
    # log("most_common_types start" + str(objgraph.most_common_types(20)))
    # tracemalloc.start(25)
    # snapshot = tracemalloc.take_snapshot()
    _global._init()
    _global.update({"md5": "1"+"0"*15})
    set_flask_global()
    # _global.update({"port": str(port)})
    log("into convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        data = request.form
        if not data:
            log("convert no data!")
            raise ConnectionError
        file_path = data.get("file_path")
        if file_path is None:
            # File content posted inline as base64.
            stream = base64.b64decode(data.get("file"))
            log("get bytes from file " + str(time.time()-_time))
        # A path was supplied: read the file from disk directly.
        else:
            with open(file_path, "rb") as f:
                stream = f.read()
            log("get bytes from file_path " + str(time.time()-_time))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        # Optional page-range restriction.
        _page_no = data.get('page_no')
        # if _type not in ['pdf']:
        #     _page_no = None
        # Optional caller-supplied timeout (overrides module default).
        _timeout = data.get('timeout')
        if _timeout is not None:
            globals().update({"time_out": _timeout})
        # Keep intermediate files when set.
        save_middle = data.get('save_middle')
        # Byte cap applied to the final result.
        max_bytes = data.get("max_bytes")
        if get_platform() == "Windows":
            # Windows path (decorator-bypass variant kept for reference):
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # Linux: overall conversion timeout handled by the timeout machinery.
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            # Some error codes still count as "success" for the caller.
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success) + " "
                + str(_type) + " "
                + " " + str(time.time() - start_time))
            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                               "is_success": is_success, "swf_images": str(swf_images)})
        # Persist the rendered result for debugging.
        # if get_platform() == "Windows":
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Strip markup to get the plain-text variant.
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment type from its plain text.
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Cap overly long results.
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + " finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(classification) + " "
                + str(time.time() - start_time))
        # log("growth end" + str(objgraph.growth()))
        # log("most_common_types end" + str(objgraph.most_common_types(20)))
        return json.dumps({"result_html": text, "result_text": only_text,
                           "is_success": 1, "swf_images": str(swf_images),
                           "classification": classification})
    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(time.time() - start_time))
        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    except Exception as e:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    finally:
        # _global._del()
        # gc.collect()
        log("finally")
        # (tracemalloc / objgraph post-mortem scaffolding removed; it was
        # entirely commented out -- restore from VCS history if needed)
def convert(data):
    """Programmatic (non-HTTP) conversion entry point.

    Return codes:
    {[str], 1}: success
    {[-1], 0}: internal logic error
    {[-2], 0}: interface call error
    {[-3], 1}: unreadable / wrong file format
    {[-4], 0}: third-party reader timeout
    {[-5], 0}: whole-conversion timeout
    {[-6], 0}: Aliyun UDF queue timeout
    {[-7], 1}: password-protected file
    :param data: dict with base64 "file", "type", optional "page_no" and
        "max_bytes" entries.
    :return: {"result_html": [...], "result_text": [...], "is_success": int,
        "swf_images": str, "classification": ...}
    """
    _global._init()
    _global.update({"md5": "1"+"0"*15})
    set_flask_global()
    log("into convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        # (models were previously injected as globals here)
        # globals().update({"global_ocr_model": ocr_model})
        # globals().update({"global_otr_model": otr_model})
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _page_no = data.get('page_no')
        max_bytes = data.get("max_bytes")
        _global.update({"md5": _md5})
        if get_platform() == "Windows":
            # Windows path (decorator-bypass variant kept for reference):
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # Linux: overall conversion timeout handled by the timeout machinery.
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            # Some error codes still count as "success" for the caller.
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success) + " "
                + str(_type) + " "
                + " " + str(time.time() - start_time))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}
        # Persist the rendered result for debugging (skipped on MaxCompute).
        if not MAX_COMPUTE:
            text_str = ""
            for t in text:
                text_str += t
            to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Strip markup to get the plain-text variant.
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment type from its plain text.
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Cap overly long results.
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + " finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(classification) + " "
                + str(time.time() - start_time))
        return {"result_html": text, "result_text": only_text,
                "is_success": 1, "swf_images": str(swf_images),
                "classification": classification}
    except ConnectionError:
        # NOTE(review): nothing in this function raises ConnectionError;
        # presumably kept for symmetry with the HTTP endpoint -- confirm.
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(time.time() - start_time))
        return {"result_html": ["-2"], "result_text": ["-2"],
                "is_success": 0, "swf_images": str([]),
                "classification": ""}
    except Exception as e:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"],
                "is_success": 0, "swf_images": str([]),
                "classification": ""}
    finally:
        log("finally")
  596. def convert_old(data, ocr_model, otr_model):
  597. """
  598. 接口返回值:
  599. {[str], 1}: 处理成功
  600. {[-1], 0}: 逻辑处理错误
  601. {[-2], 0}: 接口调用错误
  602. {[-3], 1}: 文件格式错误,无法打开
  603. {[-4], 0}: 各类文件调用第三方包读取超时
  604. {[-5], 0}: 整个转换过程超时
  605. {[-6], 0}: 阿里云UDF队列超时
  606. {[-7], 1}: 文件需密码,无法打开
  607. :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
  608. """
  609. log("into convert")
  610. _global._init()
  611. _global.update({"md5": "1"+"0"*15})
  612. # set_flask_global()
  613. start_time = time.time()
  614. _md5 = _global.get("md5")
  615. _type = None
  616. try:
  617. # 模型加入全局变量
  618. globals().update({"global_ocr_model": ocr_model})
  619. globals().update({"global_otr_model": otr_model})
  620. _time = time.time()
  621. stream = base64.b64decode(data.get("file"))
  622. _type = data.get("type")
  623. _md5 = get_md5_from_bytes(stream)
  624. _md5 = _md5[0]
  625. _global.update({"md5": _md5})
  626. log("get bytes from file " + str(time.time()-_time))
  627. if get_platform() == "Windows":
  628. try:
  629. text, swf_images = unique_temp_file_process(stream, _type, _md5)
  630. except TimeoutError:
  631. log("convert time out! 300 sec")
  632. text = [-5]
  633. swf_images = []
  634. else:
  635. # Linux 通过装饰器设置整个转换超时时间
  636. try:
  637. text, swf_images = unique_temp_file_process(stream, _type, _md5, time_out=3000)
  638. except TimeoutError:
  639. log("convert time out! 300 sec")
  640. text = [-5]
  641. swf_images = []
  642. still_success_code = [-3, -4, -7]
  643. if judge_error_code(text):
  644. if judge_error_code(text, still_success_code):
  645. is_success = 1
  646. else:
  647. is_success = 0
  648. log("md5: " + str(_md5)
  649. + " finished result: " + str(text)
  650. + " is_success: " + str(is_success) + " "
  651. + str(_type) + " "
  652. + " " + str(time.time() - start_time))
  653. return {"result_html": [str(text[0])], "result_text": [str(text[0])],
  654. "is_success": is_success, "swf_images": str(swf_images)}
  655. # 结果保存result.html
  656. text_str = ""
  657. for t in text:
  658. text_str += t
  659. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
  660. # 取纯文本
  661. only_text = []
  662. for t in text:
  663. new_t = BeautifulSoup(t, "lxml").get_text()
  664. new_t = re.sub("\n", "", new_t)
  665. only_text.append(new_t)
  666. # 判断长度,过长截取
  667. text = cut_str(text, only_text)
  668. only_text = cut_str(only_text, only_text)
  669. if len(only_text) == 0:
  670. only_text = [""]
  671. if only_text[0] == '' and len(only_text) <= 1:
  672. print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
  673. log("md5: " + str(_md5) + " "
  674. + " finished result: ['', 0] is_success: 1 "
  675. + str(_type) + " "
  676. + str(time.time() - start_time))
  677. else:
  678. log("md5: " + str(_md5) +
  679. " finished result: " + str(only_text)[:20] + " "
  680. + str(len(str(text))) + " is_success: 1 "
  681. + str(_type) + " "
  682. + str(time.time() - start_time))
  683. return {"result_html": text, "result_text": only_text,
  684. "is_success": 1, "swf_images": str(swf_images)}
  685. except ConnectionError:
  686. log("convert post has no data!" + " failed result: [-2] is_success: 0 "
  687. + str(time.time() - start_time))
  688. return {"result_html": ["-2"], "result_text": ["-2"],
  689. "is_success": 0, "swf_images": str([])}
  690. except Exception as e:
  691. log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
  692. + str(_type) + " " +
  693. str(time.time() - start_time))
  694. traceback.print_exc()
  695. return {"result_html": ["-1"], "result_text": ["-1"],
  696. "is_success": 0, "swf_images": str([])}
  697. finally:
  698. log("finally")
  699. def test_more(_dir, process_no=None):
  700. file_path_list = []
  701. for root, dirs, files in os.walk(_dir, topdown=False):
  702. for name in files:
  703. file_path_list.append(os.path.join(root, name))
  704. start_time = time.time()
  705. i = 0
  706. for p in file_path_list:
  707. if i % 10 == 0:
  708. if process_no is not None:
  709. print("Process", process_no, i, time.time()-start_time)
  710. else:
  711. print("Loop", i, time.time()-start_time)
  712. test_one(p, from_remote=True)
  713. i += 1
  714. def test_one(p, from_remote=False):
  715. with open(p, "rb") as f:
  716. file_bytes = f.read()
  717. file_base64 = base64.b64encode(file_bytes)
  718. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
  719. if from_remote:
  720. ocr_model = None
  721. otr_model = None
  722. _url = 'http://121.46.18.113:15010/convert'
  723. # _url = 'http://192.168.2.102:15010/convert'
  724. # _url = 'http://172.16.160.65:15010/convert'
  725. result = json.loads(request_post(_url, data, time_out=10000))
  726. with open("../result.html", "w") as f:
  727. f.write(result.get("result_text")[0])
  728. if p.split(".")[-1] == "swf":
  729. swf_images = eval(result.get("swf_images"))
  730. print(type(swf_images))
  731. # for img in swf_images:
  732. # img_bytes = base64.b64decode(img)
  733. # img = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
  734. # cv2.imshow("swf_images", img)
  735. # cv2.waitKey(0)
  736. else:
  737. ocr_model = ocr_interface.OcrModels().get_model()
  738. otr_model = otr_interface.OtrModels().get_model()
  739. result = convert_maxcompute(data, ocr_model, otr_model)
  740. print("result_text", result.get("result_text")[0][:20])
  741. print("is_success", result.get("is_success"))
  742. def test_duplicate(path_list, process_no=None):
  743. start_time = time.time()
  744. for i in range(500):
  745. if i % 10 == 0:
  746. if process_no is not None:
  747. print("Process", process_no, i*len(path_list), time.time()-start_time)
  748. else:
  749. print("Loop", i*len(path_list), time.time()-start_time)
  750. for p in path_list:
  751. test_one(p, from_remote=True)
# Module-level configuration shared by the conversion helpers.
global_type = ""
local_url = "http://127.0.0.1"
# Root under which per-call temp directories are created.
if get_platform() == "Windows":
    # Windows dev boxes keep temp files next to this script.
    _path = os.path.abspath(os.path.dirname(__file__))
else:
    # Production Linux layout.
    _path = "/home/admin"
# Fall back to the script directory if the chosen root does not exist.
if not os.path.exists(_path):
    _path = os.path.dirname(os.path.abspath(__file__))
if __name__ == '__main__':
    # convert interface: port from argv[1] when given, else default 15010.
    if len(sys.argv) == 2:
        port = int(sys.argv[1])
    else:
        port = 15010
    globals().update({"md5": "1"+"0"*15})
    globals().update({"port": str(port)})
    # _global._init()
    # _global.update({"md5": "1"+"0"*15})
    # _global.update({"port": str(port)})
    # ip = get_intranet_ip()
    # log("my ip"+str(ip))
    # ip = "http://" + ip
    ip_port_dict = get_ip_port()
    set_flask_global()
    if get_platform() == "Windows":
        app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
    else:
        # app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
        # NOTE(review): on Linux the parsed `port` is ignored and the app
        # binds localhost:15011 only -- looks like debug leftover; confirm
        # before deploying.
        app.run(port=15011)
    # (Commented-out manual test-harness invocations removed; use
    # test_one / test_more / test_duplicate directly -- see VCS history
    # for the original example paths.)