# convert_test.py (7.8 KB)
  1. import base64
  2. import json
  3. import os
  4. import random
  5. import sys
  6. import time
  7. from glob import glob
  8. import requests
  9. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  10. from pdfminer.converter import PDFPageAggregator
  11. from pdfminer.layout import LAParams, LTLine
  12. from pdfminer.pdfdocument import PDFDocument
  13. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  14. from pdfminer.pdfpage import PDFPage
  15. from pdfminer.pdfparser import PDFParser
  16. from pdfplumber import PDF
  17. from otr.table_line_pdf import _plot
  18. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  19. from format_convert.utils import get_platform, request_post, get_md5_from_bytes
  20. from format_convert.convert import to_html
  21. import multiprocessing as mp
  22. def test_one(p, page_no_range=None, from_remote=False, timeout=300, save_middle=None):
  23. start_time = time.time()
  24. with open(p, "rb") as f:
  25. file_bytes = f.read()
  26. file_base64 = base64.b64encode(file_bytes)
  27. _md5 = get_md5_from_bytes(file_bytes)
  28. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
  29. 'timeout': timeout, 'save_middle': save_middle}
  30. if from_remote:
  31. _url = 'http://121.46.18.113:15010/convert'
  32. # _url = 'http://192.168.2.103:15010/convert'
  33. # _url = 'http://192.168.2.102:15011/convert'
  34. # _url = 'http://172.16.160.65:15010/convert'
  35. # _url = 'http://127.0.0.1:15010/convert'
  36. result = json.loads(request_post(_url, data, time_out=timeout+20))
  37. text_str = ""
  38. for t in result.get("result_html"):
  39. text_str += t
  40. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
  41. text_str)
  42. else:
  43. print("only support remote!")
  44. print(_md5)
  45. print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
  46. print("result_text", result.get("result_text")[0][:20])
  47. print("is_success", result.get("is_success"))
  48. print(time.time()-start_time)
  49. def test_path():
  50. # _url = 'http://121.46.18.113:15010/convert'
  51. _url = 'http://192.168.0.115:15010/convert'
  52. print(_url)
  53. p = '/data/fangjiasheng/format_conversion_maxcompute/1.png'
  54. data = {"file_path": p, "type": p.split(".")[-1], "filemd5": 100, 'page_no': '1,-1',
  55. 'timeout': 10000, 'save_middle': None}
  56. print(str(data))
  57. # result = json.loads(request_post(_url, data, time_out=1000))
  58. result = json.loads(requests.post(_url, data))
  59. text_str = ""
  60. for t in result.get("result_html"):
  61. text_str += t
  62. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
  63. text_str)
  64. print("result_text", result.get("result_text")[0][:20])
  65. print("is_success", result.get("is_success"))
  66. def test_duplicate(path_list, process_no=None):
  67. start_time = time.time()
  68. # random.shuffle(path_list)
  69. for i in range(10):
  70. if i % 10 == 0:
  71. if process_no is not None:
  72. print("Process", process_no, i*len(path_list), time.time()-start_time)
  73. else:
  74. print("Loop", i*len(path_list), time.time()-start_time)
  75. for p in path_list:
  76. test_one(p, from_remote=True)
  77. def test_maxcompute(p, page_no_range=None):
  78. from format_convert import convert
  79. start_time = time.time()
  80. with open(p, "rb") as f:
  81. file_bytes = f.read()
  82. file_base64 = base64.b64encode(file_bytes)
  83. _md5 = get_md5_from_bytes(file_bytes)
  84. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range}
  85. result = convert.convert(data)
  86. text_str = ""
  87. for t in result.get("result_html"):
  88. text_str += t
  89. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
  90. text_str)
  91. print(_md5)
  92. print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
  93. print("result_text", result.get("result_text")[0][:20])
  94. print("is_success", result.get("is_success"))
  95. print(time.time()-start_time)
  96. if __name__ == '__main__':
  97. if get_platform() == "Windows":
  98. # file_path = "C:/Users/Administrator/Desktop/2.png"
  99. # file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xls"
  100. # file_path = "C:/Users/Administrator/Desktop/test_doc/error5.doc"
  101. # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
  102. # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
  103. # file_path = "C:/Users/Administrator/Downloads/W020230512399773694376.jpg"
  104. # file_path = "C:/Users/Administrator/Desktop/test_doc/error14.docx"
  105. file_path = "C:/Users/Administrator/Desktop/test_image/error9-1.png"
  106. # file_path = "C:/Users/Administrator/Desktop/test_b_table/error1.png"
  107. # file_path = "C:/Users/Administrator/Desktop/test_pdf/直接读表格线error/error62.pdf"
  108. # file_path = "C:/save_b_table/0-0895e32470613dd7be1139eefd1342c4.png"
  109. else:
  110. file_path = "1660296734009.pdf"
  111. test_one(file_path, page_no_range='1,-1', from_remote=True, timeout=1000, save_middle=None)
  112. # test_path()
  113. # file_path = "C:/Users/Administrator/Downloads/"
  114. # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
  115. # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
  116. # file_path = r"C:\Users\Administrator\Desktop\test_b_table/"
  117. file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
  118. test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
  119. ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
  120. ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
  121. ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
  122. ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
  123. ['error50.pdf', '1,-1'],
  124. ['error59.pdf', '1,-1'],
  125. ['error60.pdf', '1,-1'],
  126. ['error61.pdf', '1,-1'],
  127. ['error7.pdf', '39,57'],
  128. ['error8.pdf', '7,12'],
  129. ['error23.pdf', '1,-1']
  130. ]
  131. index = 11
  132. # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
  133. # from pdfplumber.table import TableFinder
  134. # fp = open(file_path+test_pdf_list[index][0], 'rb')
  135. # parser = PDFParser(fp)
  136. # doc_pdfminer = PDFDocument(parser)
  137. # rsrcmgr = PDFResourceManager()
  138. # laparams = LAParams(line_overlap=0.01,
  139. # char_margin=0.3,
  140. # line_margin=0.01,
  141. # word_margin=0.01,
  142. # boxes_flow=0.1, )
  143. # device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  144. # interpreter = PDFPageInterpreter(rsrcmgr, device)
  145. # doc_top = 0
  146. # doc_pdfplumber = PDF(fp)
  147. # pages = PDFPage.create_pages(doc_pdfminer)
  148. # from pdfplumber.page import Page as pdfPage
  149. # for page in pages:
  150. # page_plumber = pdfPage(doc_pdfplumber, page, page_number=1, initial_doctop=doc_top)
  151. # table_finder = TableFinder(page_plumber)
  152. # all_width_zero = True
  153. # for _edge in table_finder.get_edges():
  154. # if _edge.get('linewidth') and _edge.get('linewidth') > 0:
  155. # all_width_zero = False
  156. # break
  157. # lt_line_list = []
  158. # for _edge in table_finder.get_edges():
  159. # # print(_edge)
  160. # if _edge.get('linewidth', 0.1) > 0 or all_width_zero:
  161. # lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
  162. # (float(_edge["x1"]), float(_edge["y1"]))))
  163. # _plot(lt_line_list, 'table', 1, 1)
  164. # 测试maxcompute模式
  165. # _process = mp.Process(target=test_maxcompute, args=(file_path, '1,-1',))
  166. # _process.start()
  167. # _process.join()