# convert.py
# -*- coding: utf-8 -*-
import gc
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
# Force TensorFlow to use the CPU
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# Dynamically add the VERSION attribute to the Image class
import PIL
from PIL import Image
Image.VERSION = PIL.__version__
from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
    set_flask_global, get_md5_from_bytes, memory_decorator, register_all_fonts
# Call the function to register fonts
# register_all_fonts("/usr/share/fonts/opentype/noto/")
# register_all_fonts("/usr/share/fonts/truetype/arphic")
# register_all_fonts("/usr/share/fonts/")
from format_convert.convert_doc import doc2text, DocConvert
from format_convert.convert_docx import docx2text, DocxConvert
from format_convert.convert_image import picture2text, ImageConvert
from format_convert.convert_pdf import pdf2text, PDFConvert
from format_convert.convert_rar import rar2text, RarConvert
from format_convert.convert_swf import swf2text, SwfConvert
from format_convert.convert_txt import txt2text, TxtConvert
from format_convert.convert_xls import xls2text, XlsConvert
from format_convert.convert_xlsx import xlsx2text, XlsxConvert
from format_convert.convert_zip import zip2text, ZipConvert
from format_convert.convert_wps import WpsConvert
from format_convert.convert_ofd import OfdConvert
from format_convert.convert_need_interface import from_atc_interface
import hashlib
from format_convert.judge_platform import get_platform
from ocr import ocr_interface
from otr import otr_interface
import re
import shutil
import base64
import time
import traceback
import uuid
import logging
from bs4 import BeautifulSoup
from flask import Flask, request, g
import inspect
logging.getLogger("pdfminer").setLevel(logging.WARNING)
from format_convert.table_correct import *
from format_convert.wrapt_timeout_decorator import *
from format_convert import _global
from config.max_compute_config import MAX_COMPUTE
support_file_types = [
    'txt',
    'pdf',
    'doc',
    'docx',
    'xls',
    'xlsx',
    'zip',
    'rar',
    'jpg',
    'png',
    'jpeg',
    'swf',
    'wps',
]
if get_platform() == "Windows":
    globals().update({"time_out": 1000})
else:
    globals().update({"time_out": 300})
@memory_decorator
def getText(_type, path_or_stream, _page_no=None, time_out=300):
    @timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
    def get_html_1(_class):
        return _class.get_html()

    @timeout(600, timeout_exception=TimeoutError, use_signals=False)
    def get_html_2(_class):
        return _class.get_html()

    log("file type - " + _type + ' page - ' + str(_page_no) + ' time out - ' + str(time_out))
    try:
        ss = path_or_stream.split(".")
        unique_type_dir = ss[-2] + "_" + ss[-1] + os.sep
    except:
        unique_type_dir = path_or_stream + "_" + _type + os.sep
    if not os.path.exists(unique_type_dir):
        os.mkdir(unique_type_dir)

    if _type == "pdf":
        if MAX_COMPUTE:
            return PDFConvert(path_or_stream, unique_type_dir, _page_no).get_html()
        return get_html_1(PDFConvert(path_or_stream, unique_type_dir, _page_no))
    if _type == "docx":
        if MAX_COMPUTE:
            return DocxConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
    if _type == "zip":
        return ZipConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
        # return get_html_2(ZipConvert(path_or_stream, unique_type_dir))
    if _type == "rar":
        return RarConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
        # return get_html_2(RarConvert(path_or_stream, unique_type_dir))
    if _type == "xlsx":
        if MAX_COMPUTE:
            return XlsxConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(XlsxConvert(path_or_stream, unique_type_dir))
    if _type == "xls":
        if MAX_COMPUTE:
            return XlsConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(XlsConvert(path_or_stream, unique_type_dir))
    if _type == "doc":
        if MAX_COMPUTE:
            return DocConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(DocConvert(path_or_stream, unique_type_dir))
    if _type == "jpg" or _type == "png" or _type == "jpeg":
        if MAX_COMPUTE:
            return ImageConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(ImageConvert(path_or_stream, unique_type_dir))
    if _type == "swf":
        if MAX_COMPUTE:
            return SwfConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(SwfConvert(path_or_stream, unique_type_dir))
    if _type == "txt":
        if MAX_COMPUTE:
            return TxtConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
    if _type == "wps":
        if MAX_COMPUTE:
            return WpsConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(WpsConvert(path_or_stream, unique_type_dir))
    if _type == "ofd":
        if MAX_COMPUTE:
            return OfdConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(OfdConvert(path_or_stream, unique_type_dir))
    return [""]
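# A minimal usage sketch (illustrative only, not part of the service): convert a
# local PDF into a list of HTML fragments with getText(). The path below is a
# hypothetical example, and the call assumes the converter dependencies
# (OCR/OTR services, fonts, writable temp directories) are already available.
def _example_gettext_pdf(pdf_path="/tmp/sample.pdf"):
    # Returns a list of HTML strings, or an error-code list such as [-1] / [-5]
    html_list = getText("pdf", pdf_path, _page_no=None, time_out=300)
    return html_list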
def to_html(path, text):
    with open(path, 'w', encoding="utf8") as f:
        f.write("<!DOCTYPE HTML>")
        f.write('<head><meta charset="UTF-8"></head>')
        f.write("<body>")
        f.write(text)
        f.write("</body>")
def remove_underline(image_np):
    """
    Remove underlines beneath text.
    """
    # Grayscale
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    # Binarize
    binary = cv2.adaptiveThreshold(~gray, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
                                   15, 10)
    # Sobel kernels
    kernel_row = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], np.float32)
    kernel_col = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], np.float32)
    # binary = cv2.filter2D(binary, -1, kernel=kernel)
    binary_row = cv2.filter2D(binary, -1, kernel=kernel_row)
    binary_col = cv2.filter2D(binary, -1, kernel=kernel_col)
    cv2.imshow("custom_blur_demo", binary)
    cv2.waitKey(0)

    rows, cols = binary.shape
    # Detect horizontal lines
    scale = 5
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (cols // scale, 1))
    erodedcol = cv2.erode(binary_row, kernel, iterations=1)
    cv2.imshow("Eroded Image", erodedcol)
    cv2.waitKey(0)
    dilatedcol = cv2.dilate(erodedcol, kernel, iterations=1)
    cv2.imshow("dilate Image", dilatedcol)
    cv2.waitKey(0)
    return
# @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
# @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
def unique_temp_file_process(stream, _type, _md5, _page_no=None, time_out=300, save_middle=None):
    if get_platform() == "Windows":
        _global._init()
    if MAX_COMPUTE:
        _path = "/home/admin"
    else:
        _path = os.path.dirname(os.path.abspath(__file__))
    globals().update({"md5": _md5})
    _global.update({"md5": _md5})
    log("into unique_temp_file_process")
    try:
        # Each call gets its own unique workspace under temp/
        uid1 = uuid.uuid1().hex
        unique_space_path = _path + os.sep + "temp" + os.sep + uid1 + os.sep
        # unique_space_path = "/mnt/fangjiasheng/" + "temp/" + uid1 + "/"
        # Guard against collisions
        if not os.path.exists(unique_space_path):
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(unique_space_path)
        else:
            uid2 = uuid.uuid1().hex
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(_path + os.sep + "temp" + os.sep + uid2 + os.sep)
            # os.mkdir("/mnt/" + "temp/" + uid2 + "/")
        # Inside the unique workspace, save the incoming file under a unique name as well
        uid3 = uuid.uuid1().hex
        file_path = unique_space_path + uid3 + "." + _type
        with open(file_path, "wb") as ff:
            ff.write(stream)
        text = getText(_type, file_path, _page_no, time_out=time_out)
        # Collect the images produced by the swf conversion
        swf_images = []
        if _type == "swf":
            image_name_list = []
            for root, dirs, files in os.walk(unique_space_path, topdown=False):
                for name in files:
                    if name[-4:] == ".png" and "resize" not in name:
                        image_name_list.append(name)
            image_name_list.sort(key=lambda x: x)
            for name in image_name_list:
                with open(os.path.join(unique_space_path, name), "rb") as f:
                    img_bytes = f.read()
                    swf_images.append(base64.b64encode(img_bytes))
            log("unique_temp_file_process len(swf_images) " + str(len(swf_images)))
        return text, swf_images
    except TimeoutError:
        return [-5], []
    except Exception as e:
        log("unique_temp_file_process failed!")
        traceback.print_exc()
        return [-1], []
    finally:
        print("======================================")
        try:
            if get_platform() == "Linux" and save_middle is None:
                # log("not delete temp file")
                # Delete all files under this unique workspace
                if os.path.exists(unique_space_path):
                    shutil.rmtree(unique_space_path)
        except Exception as e:
            log("Delete Files Failed!")
def cut_str(text_list, only_text_list, max_bytes_length=2000000):
    log("into cut_str")
    try:
        if max_bytes_length and str(max_bytes_length) == '-1':
            max_bytes_length = 2000000000000
        else:
            max_bytes_length = 2000000
        # Compute the total byte length of the formatted (HTML) text
        bytes_length = 0
        for text in text_list:
            bytes_length += len(bytes(text, encoding='utf-8'))
        # Below the limit: return directly
        if bytes_length < max_bytes_length:
            # print("return text_list no cut")
            return text_list
        # Concatenate all text and recompute the unformatted byte length
        all_text = ""
        bytes_length = 0
        for text in only_text_list:
            bytes_length += len(bytes(text, encoding='utf-8'))
            all_text += text
        # Below the limit: return directly
        if bytes_length < max_bytes_length:
            print("return only_text_list no cut")
            return only_text_list
        # Truncate characters
        all_text = all_text[:int(max_bytes_length / 3)]
        return [all_text]
    except Exception as e:
        log("cut_str " + str(e))
        return ["-1"]
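# An illustrative sketch of why cut_str() slices max_bytes_length / 3 characters:
# most CJK characters take 3 bytes in UTF-8, so a slice of that many characters
# stays at or under roughly max_bytes_length bytes. The strings below are
# examples only, not data from the service.
def _example_utf8_length():
    ascii_text = "abc"      # 3 characters, 3 bytes in UTF-8
    cjk_text = "招标文件"    # 4 characters, 12 bytes in UTF-8
    return (len(bytes(ascii_text, encoding='utf-8')),
            len(bytes(cjk_text, encoding='utf-8')))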
@memory_decorator
def convert_maxcompute(data, ocr_model, otr_model):
    """
    Interface return values:
    {[str], 1}: processed successfully
    {[-1], 0}: logic processing error
    {[-2], 0}: interface call error
    {[-3], 1}: file format error, cannot be opened
    {[-4], 0}: third-party package read timeout for the file
    {[-5], 0}: the whole conversion process timed out
    {[-6], 0}: Aliyun UDF queue timeout
    {[-7], 1}: file requires a password, cannot be opened
    :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
    """
    # Limit memory usage
    # soft, hard = resource.getrlimit(resource.RLIMIT_AS)
    # resource.setrlimit(resource.RLIMIT_AS, (15 * 1024 ** 3, hard))
    log("into convert")
    start_time = time.time()
    _md5 = "1000000"
    try:
        # Put the models into global variables
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        if get_platform() == "Windows":
            # Bypass the timeout decorator (if present) and call the original function directly
            origin_unique_temp_file_process = getattr(unique_temp_file_process, "__wrapped__",
                                                      unique_temp_file_process)
            text, swf_images = origin_unique_temp_file_process(stream, _type, _md5)
        else:
            # On Linux the decorator sets the overall conversion timeout
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 1200 sec")
                text = [-5]
                swf_images = []
        error_code = [[-x] for x in range(1, 9)]
        still_success_code = [[-3], [-7]]
        if text in error_code:
            if text in still_success_code:
                print({"failed result": text, "is_success": 1}, time.time() - start_time)
                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                        "is_success": 1}
            else:
                print({"failed result": text, "is_success": 0}, time.time() - start_time)
                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                        "is_success": 0}
        # Save the result to result.html
        if get_platform() == "Windows":
            text_str = ""
            for t in text:
                text_str += t
            to_html("../result.html", text_str)
        # Extract the plain text
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Truncate if the result is too long
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
        else:
            print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
                                        "is_success": 1}, time.time() - start_time)
        return {"result_html": text, "result_text": only_text, "is_success": 1}
    except Exception as e:
        print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
        print("convert", traceback.print_exc())
        return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
# Interface setup
app = Flask(__name__)


@app.route('/convert', methods=['POST'])
def _convert():
    try:
        data = request.form
    except Exception:
        log_convert_result("1" + "0" * 15, [-1], "", 0,
                           None, None, time.time())
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    result = convert(data)
    return result
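# A minimal client-side sketch for calling the /convert endpoint; the host, port
# and file path below are placeholders. It mirrors what test_one() does:
# base64-encode the file bytes, post the form fields, and parse the JSON reply.
def _example_post_convert(file_path="/tmp/sample.pdf", url="http://127.0.0.1:15010/convert"):
    with open(file_path, "rb") as f:
        file_base64 = base64.b64encode(f.read())
    data = {"file": file_base64, "type": file_path.split(".")[-1], "filemd5": 100}
    result = json.loads(request_post(url, data, time_out=10000))
    # result keys: result_html, result_text, is_success, swf_images, classification
    return result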
def _convert_old_250613():
    """
    Interface return values:
    {[str], 1}: processed successfully
    {[-1], 0}: logic processing error
    {[-2], 0}: interface call error
    {[-3], 1}: file format error, cannot be opened
    {[-4], 0}: third-party package read timeout for the file
    {[-5], 0}: the whole conversion process timed out
    {[-6], 0}: Aliyun UDF queue timeout
    {[-7], 1}: file requires a password, cannot be opened
    {[-8], 0}: error calling an existing interface
    {[-9], 0}: the interface received empty data
    {[-10], 0}: error splitting a long image
    {[-11], 0}: error in the new idc/isr/atc interfaces
    {[-12], 0}: error joining tables across pages
    {[-13], 0}: error processing pdf table lines
    {[-14], 0}: error with the specified page range
    {[-15], 0}: the office conversion interface is not running
    :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
    """
    # log("growth start" + str(objgraph.growth()))
    # log("most_common_types start" + str(objgraph.most_common_types(20)))
    # tracemalloc.start(25)
    # snapshot = tracemalloc.take_snapshot()
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    set_flask_global()
    # _global.update({"port": str(port)})
    log("into _convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        data = request.form
        if not data:
            log("convert no data!")
            raise ConnectionError
        file_path = data.get("file_path")
        if file_path is None:
            stream = base64.b64decode(data.get("file"))
            log("get bytes from file " + str(time.time() - _time))
        # If a path is given, read the file from that path directly
        else:
            with open(file_path, "rb") as f:
                stream = f.read()
            log("get bytes from file_path " + str(time.time() - _time))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        # Requested page range
        _page_no = data.get('page_no')
        # if _type not in ['pdf']:
        #     _page_no = None
        # Requested timeout
        _timeout = data.get('timeout')
        if _timeout is not None:
            globals().update({"time_out": _timeout})
        # Whether to keep intermediate files
        save_middle = data.get('save_middle')
        # Maximum number of bytes kept in the final result
        max_bytes = data.get("max_bytes")
        if get_platform() == "Windows":
            # Bypass the timeout decorator and call the original function directly
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
                                                            time_out=globals().get('time_out'), save_middle=save_middle)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # On Linux the decorator sets the overall conversion timeout
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
                                                            time_out=globals().get('time_out'), save_middle=save_middle)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(text) + " "
                + "is_success: " + str(is_success) + " "
                + str(_type) + " "
                + 'None '
                + str(round(time.time() - start_time, 2)))
            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                               "is_success": is_success, "swf_images": str(swf_images)})
        # Save the result to result.html
        # if get_platform() == "Windows":
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract the plain text
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment type
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Truncate if the result is too long
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + "finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + 'None '
                + str(round(time.time() - start_time, 2)))
        else:
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(classification) + " "
                + str(round(time.time() - start_time, 2)))
        # log("growth end" + str(objgraph.growth()))
        # log("most_common_types end" + str(objgraph.most_common_types(20)))
        return json.dumps({"result_html": text, "result_text": only_text,
                           "is_success": 1, "swf_images": str(swf_images),
                           "classification": classification})
    except ConnectionError:
        # log("convert post has no data!" + " failed result: [-2] is_success: 0 "
        #     + str(round(time.time() - start_time, 2)))
        log("md5: " + str(_md5) + " "
            + "failed result: [-2] is_success: 0 "
            + str(_type) + " "
            + "None "
            + str(round(time.time() - start_time, 2))
            )
        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    except Exception as e:
        log("md5: " + str(_md5) + " "
            + "failed result: [-1] is_success: 0 "
            + str(_type) + " "
            + "None "
            + str(round(time.time() - start_time, 2))
            )
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    finally:
        # _global._del()
        # gc.collect()
        log("finally")
        # snapshot1 = tracemalloc.take_snapshot()
        # top_stats = snapshot1.compare_to(snapshot, 'lineno')
        # log("[ Top 20 differences ]")
        # for stat in top_stats[:20]:
        #     if stat.size_diff < 0:
        #         continue
        #     log(stat)
        # gth = objgraph.growth(limit=10)
        # for gt in gth:
        #     log("growth type:%s, count:%s, growth:%s" % (gt[0], gt[1], gt[2]))
        #     # if gt[2] > 100 or gt[1] > 300:
        #     #     continue
        #     if gt[2] < 5:
        #         continue
        #     _p = os.path.dirname(os.path.abspath(__file__))
        #     objgraph.show_backrefs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        #                            filename=_p + "/dots/%s_%s_backrefs.dot" % (_md5, gt[0]))
        #     objgraph.show_refs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        #                        filename=_p + "/dots/%s_%s_refs.dot" % (_md5, gt[0]))
        #     objgraph.show_chain(
        #         objgraph.find_backref_chain(objgraph.by_type(gt[0])[0], objgraph.is_proper_module),
        #         filename=_p + "/dots/%s_%s_chain.dot" % (_md5, gt[0])
        #     )
def convert(data):
    """
    Interface return values:
    :return: {"result_html": [str], "result_text": [str],
              "is_success": int, "swf_images": str(list)}
    """
    log("into convert")
    start_time = time.time()
    # Initialization
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    set_flask_global()
    # File md5
    _md5 = _global.get("md5")
    # File type
    _type = None
    try:
        if not data:
            log("convert no data!")
            raise ConnectionError
        file_path = data.get("file_path")
        if file_path is None:
            stream = base64.b64decode(data.get("file"))
            log("get bytes from file " + str(time.time() - start_time))
        # If a path is given, read the file from that path directly
        else:
            with open(file_path, "rb") as f:
                stream = f.read()
            log("get bytes from file_path " + str(time.time() - start_time))
        # Read the actual values
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        # Requested page range
        _page_no = data.get('page_no')
        # Requested timeout
        _timeout = data.get('timeout')
        if _timeout is not None:
            globals().update({"time_out": _timeout})
        # Whether to keep intermediate files
        save_middle = data.get('save_middle')
        # Maximum number of bytes kept in the final result
        max_bytes = data.get("max_bytes")
        # Start the conversion under a time limit
        try:
            text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
                                                        time_out=globals().get('time_out'), save_middle=save_middle)
        except TimeoutError:
            log("convert time out! 300 sec")
            text = [-5]
            swf_images = []
        # Error codes that still count as success
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log_convert_result(_md5, text, "", is_success,
                               _type, None, start_time)
            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                               "is_success": is_success, "swf_images": str(swf_images)})
        # Save the result to result.html
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract the plain text
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment type
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Truncate if the result is too long
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            log_convert_result(_md5, '', '', 1,
                               _type, None, start_time)
        else:
            log_convert_result(_md5, only_text, text, 1,
                               _type, classification, start_time)
        return json.dumps({"result_html": text, "result_text": only_text,
                           "is_success": 1, "swf_images": str(swf_images),
                           "classification": classification})
    except ConnectionError:
        log_convert_result(_md5, [-2], "", 0,
                           _type, None, start_time)
        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    except Exception:
        log_convert_result(_md5, [-1], "", 0,
                           _type, None, start_time)
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    finally:
        pass
        # log("finally")
def log_convert_result(_md5, only_text, text, is_success, _type, _attach_class, start_time):
    str_list = [
        "md5: " + str(_md5),
        "finished result: " + re.sub(' ', '', str(only_text)[:20]),
        str(len(str(text))),
        "is_success: " + str(is_success),
        str(_type),
        str(_attach_class),
        str(round(time.time() - start_time, 3)),
    ]
    info = ' '.join(str_list)
    log(info)
def convert_old_250613(data):
    """
    Interface return values:
    {[str], 1}: processed successfully
    {[-1], 0}: logic processing error
    {[-2], 0}: interface call error
    {[-3], 1}: file format error, cannot be opened
    {[-4], 0}: third-party package read timeout for the file
    {[-5], 0}: the whole conversion process timed out
    {[-6], 0}: Aliyun UDF queue timeout
    {[-7], 1}: file requires a password, cannot be opened
    :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
    """
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    set_flask_global()
    log("into convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        # Put the models into global variables
        # globals().update({"global_ocr_model": ocr_model})
        # globals().update({"global_otr_model": otr_model})
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _page_no = data.get('page_no')
        max_bytes = data.get("max_bytes")
        _global.update({"md5": _md5})
        if get_platform() == "Windows":
            # Bypass the timeout decorator and call the original function directly
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
                                                            time_out=globals().get('time_out'))
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # On Linux the decorator sets the overall conversion timeout
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
                                                            time_out=globals().get('time_out'))
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(text) + " "
                + "is_success: " + str(is_success) + " "
                + str(_type) + " "
                + "None "
                + str(round(time.time() - start_time, 2)))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}
        # Save the result to result.html
        if not MAX_COMPUTE:
            text_str = ""
            for t in text:
                text_str += t
            to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract the plain text
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment type
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Truncate if the result is too long
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            # print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + "finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + "None "
                + str(round(time.time() - start_time, 2)))
        else:
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(classification) + " "
                + str(round(time.time() - start_time, 2)))
        return {"result_html": text, "result_text": only_text,
                "is_success": 1, "swf_images": str(swf_images),
                "classification": classification}
    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(round(time.time() - start_time, 2)))
        return {"result_html": ["-2"], "result_text": ["-2"],
                "is_success": 0, "swf_images": str([]),
                "classification": ""}
    except Exception as e:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " "
            + str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"],
                "is_success": 0, "swf_images": str([]),
                "classification": ""}
    finally:
        log("finally")
def convert_old(data, ocr_model, otr_model):
    """
    Interface return values:
    {[str], 1}: processed successfully
    {[-1], 0}: logic processing error
    {[-2], 0}: interface call error
    {[-3], 1}: file format error, cannot be opened
    {[-4], 0}: third-party package read timeout for the file
    {[-5], 0}: the whole conversion process timed out
    {[-6], 0}: Aliyun UDF queue timeout
    {[-7], 1}: file requires a password, cannot be opened
    :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
    """
    log("into convert")
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    # set_flask_global()
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        # Put the models into global variables
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})
        _time = time.time()
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        log("get bytes from file " + str(time.time() - _time))
        if get_platform() == "Windows":
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # On Linux the decorator sets the overall conversion timeout
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, time_out=3000)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(text) + " "
                + "is_success: " + str(is_success) + " "
                + str(_type) + " "
                + "None "
                + str(round(time.time() - start_time, 2)))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}
        # Save the result to result.html
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract the plain text
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Truncate if the result is too long
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + "finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + "None "
                + str(round(time.time() - start_time, 2)))
        else:
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + "None "
                + str(round(time.time() - start_time, 2)))
        return {"result_html": text, "result_text": only_text,
                "is_success": 1, "swf_images": str(swf_images)}
    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(round(time.time() - start_time, 2)))
        return {"result_html": ["-2"], "result_text": ["-2"],
                "is_success": 0, "swf_images": str([])}
    except Exception as e:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " "
            + str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"],
                "is_success": 0, "swf_images": str([])}
    finally:
        log("finally")
def test_more(_dir, process_no=None):
    file_path_list = []
    for root, dirs, files in os.walk(_dir, topdown=False):
        for name in files:
            file_path_list.append(os.path.join(root, name))
    start_time = time.time()
    i = 0
    for p in file_path_list:
        if i % 10 == 0:
            if process_no is not None:
                print("Process", process_no, i, time.time() - start_time)
            else:
                print("Loop", i, time.time() - start_time)
        test_one(p, from_remote=True)
        i += 1
def test_one(p, from_remote=False):
    with open(p, "rb") as f:
        file_bytes = f.read()
    file_base64 = base64.b64encode(file_bytes)
    data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
    if from_remote:
        ocr_model = None
        otr_model = None
        _url = 'http://121.46.18.113:15010/convert'
        # _url = 'http://192.168.2.102:15010/convert'
        # _url = 'http://172.16.160.65:15010/convert'
        result = json.loads(request_post(_url, data, time_out=10000))
        with open("../result.html", "w") as f:
            f.write(result.get("result_text")[0])
        if p.split(".")[-1] == "swf":
            swf_images = eval(result.get("swf_images"))
            print(type(swf_images))
            # for img in swf_images:
            #     img_bytes = base64.b64decode(img)
            #     img = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
            #     cv2.imshow("swf_images", img)
            #     cv2.waitKey(0)
    else:
        ocr_model = ocr_interface.OcrModels().get_model()
        otr_model = otr_interface.OtrModels().get_model()
        result = convert_maxcompute(data, ocr_model, otr_model)
    print("result_text", result.get("result_text")[0][:20])
    print("is_success", result.get("is_success"))
def test_duplicate(path_list, process_no=None):
    start_time = time.time()
    for i in range(500):
        if i % 10 == 0:
            if process_no is not None:
                print("Process", process_no, i * len(path_list), time.time() - start_time)
            else:
                print("Loop", i * len(path_list), time.time() - start_time)
        for p in path_list:
            test_one(p, from_remote=True)
# global_type = ""
# local_url = "http://127.0.0.1"
# if get_platform() == "Windows":
#     _path = os.path.abspath(os.path.dirname(__file__))
# else:
#     _path = "/home/admin"
# if not os.path.exists(_path):
#     _path = os.path.dirname(os.path.abspath(__file__))
if __name__ == '__main__':
    port = 15010
    globals().update({"md5": "1" + "0" * 15})
    globals().update({"port": str(port)})
    ip_port_dict = get_ip_port()
    set_flask_global()
    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)