convert.py 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883
  1. #-*- coding: utf-8 -*-
  2. import gc
  3. import json
  4. import sys
  5. import os
  6. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  7. # 强制tf使用cpu
  8. os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
  9. from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
  10. set_flask_global, get_md5_from_bytes, memory_decorator
  11. from format_convert.convert_doc import doc2text, DocConvert
  12. from format_convert.convert_docx import docx2text, DocxConvert
  13. from format_convert.convert_image import picture2text, ImageConvert
  14. from format_convert.convert_pdf import pdf2text, PDFConvert
  15. from format_convert.convert_rar import rar2text, RarConvert
  16. from format_convert.convert_swf import swf2text, SwfConvert
  17. from format_convert.convert_txt import txt2text, TxtConvert
  18. from format_convert.convert_xls import xls2text, XlsConvert
  19. from format_convert.convert_xlsx import xlsx2text, XlsxConvert
  20. from format_convert.convert_zip import zip2text, ZipConvert
  21. from format_convert.convert_need_interface import from_atc_interface
  22. import hashlib
  23. from format_convert.judge_platform import get_platform
  24. from ocr import ocr_interface
  25. from otr import otr_interface
  26. import re
  27. import shutil
  28. import base64
  29. import time
  30. import uuid
  31. import logging
  32. from bs4 import BeautifulSoup
  33. from flask import Flask, request, g
  34. import inspect
  35. logging.getLogger("pdfminer").setLevel(logging.WARNING)
  36. from format_convert.table_correct import *
  37. from format_convert.wrapt_timeout_decorator import *
  38. from format_convert import _global
# Set True when running as a MaxCompute UDF: skips the timeout decorators in getText.
MAX_COMPUTE = False
# One-element mutable counter used by choose_port() for round-robin port selection.
port_num = [0]
  41. def choose_port():
  42. process_num = 4
  43. if port_num[0] % process_num == 0:
  44. _url = local_url + ":15011"
  45. elif port_num[0] % process_num == 1:
  46. _url = local_url + ":15012"
  47. elif port_num[0] % process_num == 2:
  48. _url = local_url + ":15013"
  49. elif port_num[0] % process_num == 3:
  50. _url = local_url + ":15014"
  51. port_num[0] = port_num[0] + 1
  52. return _url
  53. @memory_decorator
  54. def getText(_type, path_or_stream, time_out=300):
  55. @timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
  56. def get_html_1(_class):
  57. return _class.get_html()
  58. @timeout(600, timeout_exception=TimeoutError, use_signals=False)
  59. def get_html_2(_class):
  60. return _class.get_html()
  61. log("file type - " + _type)
  62. try:
  63. ss = path_or_stream.split(".")
  64. unique_type_dir = ss[-2] + "_" + ss[-1] + os.sep
  65. except:
  66. unique_type_dir = path_or_stream + "_" + _type + os.sep
  67. if _type == "pdf":
  68. if MAX_COMPUTE:
  69. return PDFConvert(path_or_stream, unique_type_dir).get_html()
  70. return get_html_1(PDFConvert(path_or_stream, unique_type_dir))
  71. if _type == "docx":
  72. if MAX_COMPUTE:
  73. return DocxConvert(path_or_stream, unique_type_dir).get_html()
  74. return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
  75. if _type == "zip":
  76. return ZipConvert(path_or_stream, unique_type_dir).get_html()
  77. # return get_html_2(ZipConvert(path_or_stream, unique_type_dir))
  78. if _type == "rar":
  79. return RarConvert(path_or_stream, unique_type_dir).get_html()
  80. # return get_html_2(RarConvert(path_or_stream, unique_type_dir))
  81. if _type == "xlsx":
  82. if MAX_COMPUTE:
  83. return XlsxConvert(path_or_stream, unique_type_dir).get_html()
  84. return get_html_1(XlsxConvert(path_or_stream, unique_type_dir))
  85. if _type == "xls":
  86. if MAX_COMPUTE:
  87. return XlsConvert(path_or_stream, unique_type_dir).get_html()
  88. return get_html_1(XlsConvert(path_or_stream, unique_type_dir))
  89. if _type == "doc":
  90. if MAX_COMPUTE:
  91. return DocConvert(path_or_stream, unique_type_dir).get_html()
  92. return get_html_1(DocConvert(path_or_stream, unique_type_dir))
  93. if _type == "jpg" or _type == "png" or _type == "jpeg":
  94. if MAX_COMPUTE:
  95. return ImageConvert(path_or_stream, unique_type_dir).get_html()
  96. return get_html_1(ImageConvert(path_or_stream, unique_type_dir))
  97. if _type == "swf":
  98. if MAX_COMPUTE:
  99. return SwfConvert(path_or_stream, unique_type_dir).get_html()
  100. return get_html_1(SwfConvert(path_or_stream, unique_type_dir))
  101. if _type == "txt":
  102. if MAX_COMPUTE:
  103. return TxtConvert(path_or_stream, unique_type_dir).get_html()
  104. return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
  105. return [""]
  106. def to_html(path, text):
  107. with open(path, 'w',encoding="utf8") as f:
  108. f.write("<!DOCTYPE HTML>")
  109. f.write('<head><meta charset="UTF-8"></head>')
  110. f.write("<body>")
  111. f.write(text)
  112. f.write("</body>")
  113. def resize_image(image_path, size):
  114. try:
  115. image_np = cv2.imread(image_path)
  116. # print(image_np.shape)
  117. width = image_np.shape[1]
  118. height = image_np.shape[0]
  119. h_w_rate = height / width
  120. # width_standard = 900
  121. # height_standard = 1400
  122. width_standard = size[1]
  123. height_standard = size[0]
  124. width_new = int(height_standard / h_w_rate)
  125. height_new = int(width_standard * h_w_rate)
  126. if width > width_standard:
  127. image_np = cv2.resize(image_np, (width_standard, height_new))
  128. elif height > height_standard:
  129. image_np = cv2.resize(image_np, (width_new, height_standard))
  130. cv2.imwrite(image_path, image_np)
  131. # print("resize_image", image_np.shape)
  132. return
  133. except Exception as e:
  134. log("resize_image")
  135. print("resize_image", e, global_type)
  136. return
def remove_red_seal(image_np):
    """
    Remove a red seal/stamp from an image.

    NOTE(review): calls cv2.imshow/cv2.waitKey, so this is a debug-only
    helper and will block (or fail) in a headless environment — confirm
    it is not invoked on the service path.

    :param image_np: BGR image array as returned by cv2.imread
    :return: image with the red seal suppressed
    """
    # Take the red channel.
    blue_c, green_c, red_c = cv2.split(image_np)
    # With cv2.THRESH_OTSU and thresh=0 the optimal threshold is computed.
    thresh, ret = cv2.threshold(red_c, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Lowering the Otsu threshold slightly works better in practice.
    # NOTE(review): the original comment said 95% but the code uses 98%.
    filter_condition = int(thresh * 0.98)
    thresh1, red_thresh = cv2.threshold(red_c, filter_condition, 255, cv2.THRESH_BINARY)
    # Expand the mask back to a 3-channel image.
    image_and = np.expand_dims(red_thresh, axis=2)
    image_and = np.concatenate((image_and, image_and, image_and), axis=-1)
    # Erode the mask (thickens dark strokes).
    gray = cv2.cvtColor(image_and, cv2.COLOR_RGB2GRAY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    erode = cv2.erode(gray, kernel)
    cv2.imshow("erode", erode)
    cv2.waitKey(0)
    # Combine the de-sealed mask with the blue channel and invert back.
    image_and = np.bitwise_and(cv2.bitwise_not(blue_c), cv2.bitwise_not(erode))
    result_img = cv2.bitwise_not(image_and)
    cv2.imshow("remove_red_seal", result_img)
    cv2.waitKey(0)
    return result_img
def remove_underline(image_np):
    """
    Remove text underlines from an image.

    NOTE(review): calls cv2.imshow/cv2.waitKey and returns None, so this is
    a visualization/debug routine — not usable headless and it does not
    return the processed image.
    """
    # Grayscale.
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    # Adaptive binarization of the inverted grayscale image.
    binary = cv2.adaptiveThreshold(~gray, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
                                   15, 10)
    # Sobel kernels for horizontal (row) and vertical (col) edges.
    kernel_row = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], np.float32)
    kernel_col = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], np.float32)
    # binary = cv2.filter2D(binary, -1, kernel=kernel)
    binary_row = cv2.filter2D(binary, -1, kernel=kernel_row)
    binary_col = cv2.filter2D(binary, -1, kernel=kernel_col)
    cv2.imshow("custom_blur_demo", binary)
    cv2.waitKey(0)
    rows, cols = binary.shape
    # Detect horizontal lines with a wide, 1-px-high structuring element.
    scale = 5
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (cols // scale, 1))
    erodedcol = cv2.erode(binary_row, kernel, iterations=1)
    cv2.imshow("Eroded Image", erodedcol)
    cv2.waitKey(0)
    dilatedcol = cv2.dilate(erodedcol, kernel, iterations=1)
    cv2.imshow("dilate Image", dilatedcol)
    cv2.waitKey(0)
    return
  193. def getMDFFromFile(path):
  194. _length = 0
  195. try:
  196. _md5 = hashlib.md5()
  197. with open(path, "rb") as ff:
  198. while True:
  199. data = ff.read(4096)
  200. if not data:
  201. break
  202. _length += len(data)
  203. _md5.update(data)
  204. return _md5.hexdigest(), _length
  205. except Exception as e:
  206. traceback.print_exc()
  207. return None, _length
  208. def add_html_format(text_list):
  209. new_text_list = []
  210. for t in text_list:
  211. html_t = "<!DOCTYPE HTML>\n"
  212. html_t += '<head><meta charset="UTF-8"></head>\n'
  213. html_t += "<body>\n"
  214. html_t += t
  215. html_t += "\n</body>\n"
  216. new_text_list.append(html_t)
  217. return new_text_list
# Whole-conversion timeout in seconds: longer on Windows dev machines.
if get_platform() == "Windows":
    globals().update({"time_out": 1000})
else:
    globals().update({"time_out": 300})
# @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
# @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
def unique_temp_file_process(stream, _type, _md5, time_out=300):
    """Save *stream* into a unique temp workspace, convert it, then clean up.

    :param stream: raw file bytes
    :param _type: file extension, e.g. "pdf"
    :param _md5: md5 of the stream (stored in globals for log correlation)
    :param time_out: per-conversion timeout forwarded to getText
    :return: (text, swf_images) — text is a list of html strings or an error
        code list ([-5] timeout, [-1] failure); swf_images is a list of
        base64-encoded pngs, populated only when _type == "swf".
    """
    if get_platform() == "Windows":
        _global._init()
        globals().update({"md5": _md5})
        _global.update({"md5": _md5})
    log("into unique_temp_file_process")
    try:
        # Create a unique workspace under <_path>/temp for this call.
        uid1 = uuid.uuid1().hex
        unique_space_path = _path + os.sep + "temp" + os.sep + uid1 + os.sep
        # unique_space_path = "/mnt/fangjiasheng/" + "temp/" + uid1 + "/"
        # Guard against an (unlikely) uuid collision.
        if not os.path.exists(unique_space_path):
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(unique_space_path)
        else:
            uid2 = uuid.uuid1().hex
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(_path + os.sep + "temp" + os.sep + uid2 + os.sep)
            # os.mkdir("/mnt/" + "temp/" + uid2 + "/")
            # NOTE(review): on collision the file below is still written into
            # unique_space_path, not the freshly created uid2 directory —
            # confirm this is intentional.
        # Save the incoming stream under a unique name inside the workspace.
        uid3 = uuid.uuid1().hex
        file_path = unique_space_path + uid3 + "." + _type
        with open(file_path, "wb") as ff:
            ff.write(stream)
        text = getText(_type, file_path, time_out=time_out)
        # Collect the png frames produced by swf conversion (resized copies excluded).
        swf_images = []
        if _type == "swf":
            image_name_list = []
            for root, dirs, files in os.walk(unique_space_path, topdown=False):
                for name in files:
                    if name[-4:] == ".png" and "resize" not in name:
                        image_name_list.append(name)
            image_name_list.sort(key=lambda x: x)
            for name in image_name_list:
                with open(os.path.join(unique_space_path, name), "rb") as f:
                    img_bytes = f.read()
                    swf_images.append(base64.b64encode(img_bytes))
            log("unique_temp_file_process len(swf_images) " + str(len(swf_images)))
        return text, swf_images
    except TimeoutError:
        return [-5], []
    except Exception as e:
        log("unique_temp_file_process failed!")
        traceback.print_exc()
        return [-1], []
    finally:
        print("======================================")
        try:
            if get_platform() == "Linux":
                # log("not delete temp file")
                # Delete the whole unique workspace (kept on Windows for debugging).
                if os.path.exists(unique_space_path):
                    shutil.rmtree(unique_space_path)
        except Exception as e:
            log("Delete Files Failed!")
  283. def cut_str(text_list, only_text_list, max_bytes_length=2000000):
  284. log("into cut_str")
  285. try:
  286. # 计算有格式总字节数
  287. bytes_length = 0
  288. for text in text_list:
  289. bytes_length += len(bytes(text, encoding='utf-8'))
  290. # print("text_list", bytes_length)
  291. # 小于直接返回
  292. if bytes_length < max_bytes_length:
  293. print("return text_list no cut")
  294. return text_list
  295. # 全部文件连接,重新计算无格式字节数
  296. all_text = ""
  297. bytes_length = 0
  298. for text in only_text_list:
  299. bytes_length += len(bytes(text, encoding='utf-8'))
  300. all_text += text
  301. # print("only_text_list", bytes_length)
  302. # 小于直接返回
  303. if bytes_length < max_bytes_length:
  304. print("return only_text_list no cut")
  305. return only_text_list
  306. # 截取字符
  307. all_text = all_text[:int(max_bytes_length/3)]
  308. # print("text bytes ", len(bytes(all_text, encoding='utf-8')))
  309. # print("return only_text_list has cut")
  310. return [all_text]
  311. except Exception as e:
  312. log("cut_str " + str(e))
  313. return ["-1"]
@memory_decorator
def convert_maxcompute(data, ocr_model, otr_model):
    """
    Conversion entry point for the MaxCompute/UDF path.

    Return conventions:
        {[str], 1}: success
        {[-1], 0}: logic error
        {[-2], 0}: interface call error
        {[-3], 1}: file format error, cannot open
        {[-4], 0}: third-party reader timed out
        {[-5], 0}: whole conversion timed out
        {[-6], 0}: Aliyun UDF queue timed out
        {[-7], 1}: file requires a password, cannot open
    :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
    """
    # Memory cap (disabled):
    # soft, hard = resource.getrlimit(resource.RLIMIT_AS)
    # resource.setrlimit(resource.RLIMIT_AS, (15 * 1024 ** 3, hard))
    log("into convert")
    start_time = time.time()
    _md5 = "1000000"
    try:
        # Expose the models as globals for downstream converters.
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        if get_platform() == "Windows":
            # Bypass the timeout decorator and call the undecorated function.
            # NOTE(review): the decorator on unique_temp_file_process is
            # currently commented out, so __wrapped__ may not exist — confirm.
            origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            text, swf_images = origin_unique_temp_file_process(stream, _type, _md5)
        else:
            # On Linux the decorator enforces the overall conversion timeout.
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 1200 sec")
                text = [-5]
                swf_images = []
        error_code = [[-x] for x in range(1, 9)]
        still_success_code = [[-3], [-7]]
        if text in error_code:
            if text in still_success_code:
                print({"failed result": text, "is_success": 1}, time.time() - start_time)
                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                        "is_success": 1}
            else:
                print({"failed result": text, "is_success": 0}, time.time() - start_time)
                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                        "is_success": 0}
        # Save the result as result.html (Windows debugging aid).
        if get_platform() == "Windows":
            text_str = ""
            for t in text:
                text_str += t
            to_html("../result.html", text_str)
        # Extract the plain text from the html.
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Truncate when too long.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
        else:
            print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
                                        "is_success": 1}, time.time() - start_time)
        return {"result_html": text, "result_text": only_text, "is_success": 1}
    except Exception as e:
        print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
        print("convert", traceback.print_exc())
        return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
# Interface configuration: Flask app exposing the /convert endpoint.
app = Flask(__name__)
@app.route('/convert', methods=['POST'])
def _convert():
    """
    HTTP conversion interface: convert a posted file to html + plain text.

    Return conventions:
        {[str], 1}: success
        {[-1], 0}: logic error
        {[-2], 0}: interface call error
        {[-3], 1}: file format error, cannot open
        {[-4], 0}: third-party reader timed out
        {[-5], 0}: whole conversion timed out
        {[-6], 0}: Aliyun UDF queue timed out
        {[-7], 1}: file requires a password, cannot open
    :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
    """
    # log("growth start" + str(objgraph.growth()))
    # log("most_common_types start" + str(objgraph.most_common_types(20)))
    # tracemalloc.start(25)
    # snapshot = tracemalloc.take_snapshot()
    _global._init()
    _global.update({"md5": "1"+"0"*15})
    set_flask_global()
    # _global.update({"port": str(port)})
    log("into convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        data = request.form
        if not data:
            log("convert no data!")
            raise ConnectionError
        file_path = data.get("file_path")
        # Either the file bytes are posted base64-encoded ...
        if file_path is None:
            stream = base64.b64decode(data.get("file"))
            log("get bytes from file " + str(time.time()-_time))
        # ... or a path is posted and the file is read directly.
        else:
            with open(file_path, "rb") as f:
                stream = f.read()
            log("get bytes from file_path " + str(time.time()-_time))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        if get_platform() == "Windows":
            # (previously bypassed the timeout decorator on Windows)
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # On Linux the decorator enforces the overall conversion timeout.
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success) + " "
                + str(_type) + " "
                + " " + str(time.time() - start_time))
            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                               "is_success": is_success, "swf_images": str(swf_images)})
        # Save the result as result.html (debugging aid).
        # if get_platform() == "Windows":
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract the plain text from the html.
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment type from its plain text.
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Truncate when too long.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + " finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(classification) + " "
                + str(time.time() - start_time))
        # log("growth end" + str(objgraph.growth()))
        # log("most_common_types end" + str(objgraph.most_common_types(20)))
        return json.dumps({"result_html": text, "result_text": only_text,
                           "is_success": 1, "swf_images": str(swf_images),
                           "classification": classification})
    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(time.time() - start_time))
        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    except Exception as e:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    finally:
        # _global._del()
        # gc.collect()
        log("finally")
        # Leak-debugging scaffolding kept for reference:
        # snapshot1 = tracemalloc.take_snapshot()
        # top_stats = snapshot1.compare_to(snapshot, 'lineno')
        # log("[ Top 20 differences ]")
        # for stat in top_stats[:20]:
        #     if stat.size_diff < 0:
        #         continue
        #     log(stat)
        # gth = objgraph.growth(limit=10)
        # for gt in gth:
        #     log("growth type:%s, count:%s, growth:%s" % (gt[0], gt[1], gt[2]))
        #     # if gt[2] > 100 or gt[1] > 300:
        #     #     continue
        #     if gt[2] < 5:
        #         continue
        #     _p = os.path.dirname(os.path.abspath(__file__))
        #     objgraph.show_backrefs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        #                            filename=_p + "/dots/%s_%s_backrefs.dot" % (_md5, gt[0]))
        #     objgraph.show_refs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        #                        filename=_p + "/dots/%s_%s_refs.dot" % (_md5, gt[0]))
        #     objgraph.show_chain(
        #         objgraph.find_backref_chain(objgraph.by_type(gt[0])[0], objgraph.is_proper_module),
        #         filename=_p + "/dots/%s_%s_chain.dot" % (_md5, gt[0])
        #     )
def convert(data, ocr_model, otr_model):
    """
    Direct-call conversion entry point (no Flask), used for local invocation.

    Return conventions:
        {[str], 1}: success
        {[-1], 0}: logic error
        {[-2], 0}: interface call error
        {[-3], 1}: file format error, cannot open
        {[-4], 0}: third-party reader timed out
        {[-5], 0}: whole conversion timed out
        {[-6], 0}: Aliyun UDF queue timed out
        {[-7], 1}: file requires a password, cannot open
    :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
    """
    log("into convert")
    _global._init()
    _global.update({"md5": "1"+"0"*15})
    # set_flask_global()
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        # Expose the models as globals for downstream converters.
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})
        _time = time.time()
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        log("get bytes from file " + str(time.time()-_time))
        if get_platform() == "Windows":
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # On Linux a much longer overall timeout (3000 s) is allowed.
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, time_out=3000)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success) + " "
                + str(_type) + " "
                + " " + str(time.time() - start_time))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}
        # Save the result as result.html (debugging aid).
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract the plain text from the html.
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Truncate when too long.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + " finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        return {"result_html": text, "result_text": only_text,
                "is_success": 1, "swf_images": str(swf_images)}
    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(time.time() - start_time))
        return {"result_html": ["-2"], "result_text": ["-2"],
                "is_success": 0, "swf_images": str([])}
    except Exception as e:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"],
                "is_success": 0, "swf_images": str([])}
    finally:
        log("finally")
  652. def test_more(_dir, process_no=None):
  653. file_path_list = []
  654. for root, dirs, files in os.walk(_dir, topdown=False):
  655. for name in files:
  656. file_path_list.append(os.path.join(root, name))
  657. start_time = time.time()
  658. i = 0
  659. for p in file_path_list:
  660. if i % 10 == 0:
  661. if process_no is not None:
  662. print("Process", process_no, i, time.time()-start_time)
  663. else:
  664. print("Loop", i, time.time()-start_time)
  665. test_one(p, from_remote=True)
  666. i += 1
  667. def test_one(p, from_remote=False):
  668. with open(p, "rb") as f:
  669. file_bytes = f.read()
  670. file_base64 = base64.b64encode(file_bytes)
  671. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
  672. if from_remote:
  673. ocr_model = None
  674. otr_model = None
  675. _url = 'http://127.0.0.1:15010/convert'
  676. # _url = 'http://192.168.2.102:15010/convert'
  677. # _url = 'http://172.16.160.65:15010/convert'
  678. result = json.loads(request_post(_url, data, time_out=10000))
  679. if p.split(".")[-1] == "swf":
  680. swf_images = eval(result.get("swf_images"))
  681. print(type(swf_images))
  682. # for img in swf_images:
  683. # img_bytes = base64.b64decode(img)
  684. # img = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
  685. # cv2.imshow("swf_images", img)
  686. # cv2.waitKey(0)
  687. else:
  688. ocr_model = ocr_interface.OcrModels().get_model()
  689. otr_model = otr_interface.OtrModels().get_model()
  690. result = convert_maxcompute(data, ocr_model, otr_model)
  691. print("result_text", result.get("result_text")[0][:20])
  692. print("is_success", result.get("is_success"))
  693. def test_duplicate(path_list, process_no=None):
  694. start_time = time.time()
  695. for i in range(500):
  696. if i % 10 == 0:
  697. if process_no is not None:
  698. print("Process", process_no, i*len(path_list), time.time()-start_time)
  699. else:
  700. print("Loop", i*len(path_list), time.time()-start_time)
  701. for p in path_list:
  702. test_one(p, from_remote=True)
# Used only in resize_image's error log; never reassigned in this file.
global_type = ""
# Base url of the local conversion services (see choose_port).
local_url = "http://127.0.0.1"
# Root directory that holds the per-call "temp" workspaces.
if get_platform() == "Windows":
    _path = os.path.abspath(os.path.dirname(__file__))
else:
    _path = "/home/admin"
    # Fall back to the source directory when /home/admin does not exist.
    if not os.path.exists(_path):
        _path = os.path.dirname(os.path.abspath(__file__))
if __name__ == '__main__':
    # Start the convert interface service; port may be given as argv[1].
    if len(sys.argv) == 2:
        port = int(sys.argv[1])
    else:
        port = 15010
    globals().update({"md5": "1"+"0"*15})
    globals().update({"port": str(port)})
    # _global._init()
    # _global.update({"md5": "1"+"0"*15})
    # _global.update({"port": str(port)})
    # ip = get_intranet_ip()
    # log("my ip"+str(ip))
    # ip = "http://" + ip
    ip_port_dict = get_ip_port()
    ip = "http://127.0.0.1"
    processes = ip_port_dict.get(ip).get("convert_processes")
    set_flask_global()
    if get_platform() == "Windows":
        app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
    else:
        # NOTE(review): the computed `port` and `processes` values are ignored
        # here — the service is hard-wired to port 15011 (the multi-process
        # call is commented out). Confirm this is intentional.
        # app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
        app.run(port=15011)

    # Single-file test:
    # if get_platform() == "Windows":
    #     # file_path = "C:/Users/Administrator/Desktop/error7.jpg"
    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
    #     file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
    # else:
    #     file_path = "test1.doc"
    # test_one(file_path, from_remote=True)

    # Multi-process directory test:
    # if get_platform() == "Windows":
    #     file_dir = "D:/BIDI_DOC/比地_文档/table_images/"
    # else:
    #     file_dir = "../table_images/"
    #
    # for j in range(10):
    #     p = Process(target=test_more, args=(file_dir, j, ))
    #     p.start()
    #     p.join()

    # Duplicate-conversion stress test:
    # if get_platform() == "Windows":
    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/11111111.rar"]
    #     file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
    #                       "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"]
    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc"]
    #
    # else:
    #     file_path_list = ["test1.pdf"]
    # for j in range(10):
    #     p = Process(target=test_duplicate, args=(file_path_list, j, ))
    #     p.start()
    #     p.join()