convert_test.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. import base64
  2. import concurrent.futures
  3. import json
  4. import os
  5. import random
  6. import sys
  7. import time
  8. import traceback
  9. from glob import glob
  10. import requests
  11. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  12. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  13. from format_convert.utils import get_platform, request_post, get_md5_from_bytes
  14. from format_convert.convert import to_html
  15. import multiprocessing as mp
  16. html_output_dir = os.path.dirname(os.path.abspath(__file__)) + "/../html_output/"
  17. def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=False):
  18. if type(p) == tuple:
  19. p, page_no_range, timeout, save_middle, save_html = p
  20. start_time = time.time()
  21. with open(p, "rb") as f:
  22. file_bytes = f.read()
  23. file_base64 = base64.b64encode(file_bytes)
  24. _md5 = get_md5_from_bytes(file_bytes)
  25. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
  26. 'timeout': timeout, 'save_middle': save_middle}
  27. # _url = 'http://dianxin.bidizhaobiao.com:15010/convert'
  28. # _url = 'http://192.168.2.103:15010/convert'
  29. # _url = 'http://192.168.2.102:15010/convert'
  30. # _url = 'http://172.16.160.65:15010/convert'
  31. _url = 'http://127.0.0.1:15010/convert'
  32. text_str = ""
  33. try:
  34. result = json.loads(request_post(_url, data, time_out=timeout+20))
  35. print('result', result)
  36. for t in result.get("result_html"):
  37. text_str += t
  38. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
  39. text_str)
  40. if save_html:
  41. new_path = html_output_dir + p.split(os.sep)[-1].split('.')[0] + '.html'
  42. if 0 < len(text_str) <= 3 and text_str[0] == '-':
  43. print(new_path, text_str)
  44. else:
  45. to_html(new_path, text_str)
  46. print(_md5)
  47. # print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
  48. print("result_text", result.get("result_text")[0][:20])
  49. print("is_success", result.get("is_success"))
  50. except:
  51. traceback.print_exc()
  52. print(_md5)
  53. print("is_success", 0)
  54. print(time.time()-start_time)
  55. return p, 1
  56. def test_path():
  57. # _url = 'http://121.46.18.113:15010/convert'
  58. _url = 'http://192.168.0.115:15010/convert'
  59. print(_url)
  60. p = '/data/fangjiasheng/format_conversion_maxcompute/1.png'
  61. data = {"file_path": p, "type": p.split(".")[-1], "filemd5": 100, 'page_no': '1,-1',
  62. 'timeout': 10000, 'save_middle': None}
  63. print(str(data))
  64. # result = json.loads(request_post(_url, data, time_out=1000))
  65. result = json.loads(requests.post(_url, data))
  66. text_str = ""
  67. for t in result.get("result_html"):
  68. text_str += t
  69. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
  70. text_str)
  71. print("result_text", result.get("result_text")[0][:20])
  72. print("is_success", result.get("is_success"))
  73. def test_duplicate(path_list, process_no=None):
  74. start_time = time.time()
  75. # random.shuffle(path_list)
  76. for i in range(10):
  77. if i % 10 == 0:
  78. if process_no is not None:
  79. print("Process", process_no, i*len(path_list), time.time()-start_time)
  80. else:
  81. print("Loop", i*len(path_list), time.time()-start_time)
  82. for p in path_list:
  83. test_one(p, from_remote=True)
  84. def test_maxcompute(p, page_no_range=None):
  85. from format_convert import convert
  86. start_time = time.time()
  87. with open(p, "rb") as f:
  88. file_bytes = f.read()
  89. file_base64 = base64.b64encode(file_bytes)
  90. _md5 = get_md5_from_bytes(file_bytes)
  91. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range}
  92. result = convert.convert(data)
  93. text_str = ""
  94. for t in result.get("result_html"):
  95. text_str += t
  96. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
  97. text_str)
  98. print(_md5)
  99. print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
  100. print("result_text", result.get("result_text")[0][:20])
  101. print("is_success", result.get("is_success"))
  102. print(time.time()-start_time)
  103. def run_files(thread_num=20):
  104. paths = glob(r'C:\Users\Administrator\Downloads\招标文件内容提取\*')
  105. temp_list = []
  106. for _path in paths:
  107. new_path = html_output_dir + _path.split(os.sep)[-1].split('.')[0] + '.html'
  108. if os.path.exists(new_path):
  109. continue
  110. temp_list.append(_path)
  111. paths = temp_list
  112. print('len(paths)', len(paths))
  113. with concurrent.futures.ThreadPoolExecutor(max_workers=thread_num) as executor:
  114. tasks = []
  115. for _path in paths:
  116. tasks.append((_path, '1,-1', 10000, None, True))
  117. # 提交任务给线程池
  118. results = executor.map(test_one, tasks)
  119. for result in results:
  120. print(result)
  121. def test_kimi():
  122. MOONSHOT_API_KEY = 'sk-ZqQBQfVBrs1lIilWVgggYqFwGcMu5pjlCeQf2SZL1KDlg1Pj'
  123. paths = glob(html_output_dir + '*.html')
  124. for p in paths[:100]:
  125. with open(p, 'r', encoding='utf-8') as f:
  126. _str = f.read()
  127. print('len(_str)', len(_str))
  128. data = {
  129. 'model': 'moonshot-v1-8k',
  130. 'messages': [
  131. {
  132. "role": "user",
  133. "content": _str
  134. }
  135. ],
  136. }
  137. _url = 'https://api.moonshot.cn/v1/tokenizers/estimate-token-count'
  138. headers = {'Content-Type': 'application/json',
  139. "Authorization": "Bearer " + MOONSHOT_API_KEY}
  140. result = requests.post(_url, json=data, data=None, headers=headers, timeout=100)
  141. print(result.text)
  142. if __name__ == '__main__':
  143. if get_platform() == "Windows":
  144. # file_path = "C:/Users/Administrator/Downloads/1750737587843.ofd"
  145. # file_path = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf/e-1.pdf'
  146. # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
  147. # file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xlsx"
  148. # file_path = "C:/Users/Administrator/Desktop/test_doc/error17.docx"
  149. # file_path = "C:/Users/Administrator/Desktop/test_swf/error2.swf"
  150. # file_path = "C:/Users/Administrator/Desktop/test_rar/error1.rar"
  151. # file_path = "C:/Users/Administrator/Desktop/test_image/error18.png"
  152. # file_path = "C:/Users/Administrator/Desktop/test_b_table/error29.png"
  153. # file_path = "C:/Users/Administrator/Desktop/test_pdf/普通error/error6.pdf"
  154. # file_path = "C:/Users/Administrator/Desktop/test_table_head/error2.pdf"
  155. # file_path = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
  156. file_path = "C:/Users/Administrator/Desktop/test_ofd/1750381792388.ofd"
  157. else:
  158. file_path = "1660296734009.pdf"
  159. # test_one(file_path, page_no_range="1,-1", timeout=1000, save_middle=None)
  160. test_one(file_path, page_no_range=None, timeout=1000, save_middle=None)
  161. # run_files()
  162. # test_kimi()
  163. # test_path()
  164. # file_path = "C:/Users/Administrator/Downloads/"
  165. # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
  166. # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
  167. # file_path = r"C:\Users\Administrator\Desktop\test_b_table/"
  168. # file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
  169. # test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
  170. # ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
  171. # ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
  172. # ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
  173. # ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
  174. # ['error50.pdf', '1,-1'],
  175. # ['error59.pdf', '1,-1'],
  176. # ['error60.pdf', '1,-1'],
  177. # ['error61.pdf', '1,-1'],
  178. # ['error7.pdf', '39,57'],
  179. # ['error8.pdf', '7,12'],
  180. # ['error23.pdf', '1,-1']
  181. # ]
  182. # index = 11
  183. # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
  184. # 测试maxcompute模式
  185. # _process = mp.Process(target=test_maxcompute, args=(file_path, '1,-1',))
  186. # _process.start()
  187. # _process.join()