convert_test.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. import base64
  2. import json
  3. import os
  4. import random
  5. import sys
  6. import time
  7. from glob import glob
  8. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  9. from format_convert.utils import get_platform, request_post, get_md5_from_bytes
  10. from format_convert.convert import to_html
  11. import multiprocessing as mp
  12. def test_one(p, page_no_range=None, from_remote=False):
  13. start_time = time.time()
  14. with open(p, "rb") as f:
  15. file_bytes = f.read()
  16. file_base64 = base64.b64encode(file_bytes)
  17. _md5 = get_md5_from_bytes(file_bytes)
  18. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range}
  19. if from_remote:
  20. _url = 'http://121.46.18.113:15010/convert'
  21. # _url = 'http://192.168.2.103:15010/convert'
  22. # _url = 'http://192.168.2.102:15011/convert'
  23. # _url = 'http://172.16.160.65:15010/convert'
  24. # _url = 'http://127.0.0.1:15010/convert'
  25. result = json.loads(request_post(_url, data, time_out=10000))
  26. text_str = ""
  27. for t in result.get("result_html"):
  28. text_str += t
  29. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
  30. text_str)
  31. else:
  32. print("only support remote!")
  33. print(_md5)
  34. print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
  35. print("result_text", result.get("result_text")[0][:20])
  36. print("is_success", result.get("is_success"))
  37. print(time.time()-start_time)
  38. def test_duplicate(path_list, process_no=None):
  39. start_time = time.time()
  40. # random.shuffle(path_list)
  41. for i in range(10):
  42. if i % 10 == 0:
  43. if process_no is not None:
  44. print("Process", process_no, i*len(path_list), time.time()-start_time)
  45. else:
  46. print("Loop", i*len(path_list), time.time()-start_time)
  47. for p in path_list:
  48. test_one(p, from_remote=True)
  49. def test_maxcompute(p, page_no_range=None):
  50. from format_convert import convert
  51. start_time = time.time()
  52. with open(p, "rb") as f:
  53. file_bytes = f.read()
  54. file_base64 = base64.b64encode(file_bytes)
  55. _md5 = get_md5_from_bytes(file_bytes)
  56. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range}
  57. result = convert.convert(data)
  58. text_str = ""
  59. for t in result.get("result_html"):
  60. text_str += t
  61. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
  62. text_str)
  63. print(_md5)
  64. print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
  65. print("result_text", result.get("result_text")[0][:20])
  66. print("is_success", result.get("is_success"))
  67. print(time.time()-start_time)
  68. if __name__ == '__main__':
  69. if get_platform() == "Windows":
  70. # file_path = "C:/Users/Administrator/Desktop/2.png"
  71. file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xls"
  72. # file_path = "C:/Users/Administrator/Desktop/test_doc/error5.doc"
  73. # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
  74. # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
  75. # file_path = "C:/Users/Administrator/Downloads/1688432101601.xlsx"
  76. # file_path = "C:/Users/Administrator/Desktop/test_doc/error14.docx"
  77. # file_path = "C:/Users/Administrator/Desktop/test_image/error36.png"
  78. # file_path = "C:/Users/Administrator/Desktop/test_b_table/error1.png"
  79. # file_path = "C:/Users/Administrator/Desktop/test_pdf/表格连接error/error7.pdf"
  80. # file_path = "C:/save_b_table/0-0895e32470613dd7be1139eefd1342c4.png"
  81. else:
  82. file_path = "1660296734009.pdf"
  83. test_one(file_path, page_no_range='1,-1', from_remote=True)
  84. file_path = "C:/Users/Administrator/Downloads/"
  85. # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
  86. # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
  87. test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
  88. ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
  89. ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
  90. ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
  91. ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
  92. ['error50.pdf', '1,-1'],
  93. ['error59.pdf', '1,-1'],
  94. ['error51.pdf', '1,-1'],
  95. ['error7.pdf', '39,57'],
  96. ]
  97. index = 1
  98. # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
  99. # 测试maxcompute模式
  100. # _process = mp.Process(target=test_maxcompute, args=(file_path, '1,-1',))
  101. # _process.start()
  102. # _process.join()