# convert_test.py
  1. import base64
  2. import concurrent.futures
  3. import json
  4. import os
  5. import random
  6. import sys
  7. import time
  8. import traceback
  9. from glob import glob
  10. import requests
  11. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  12. from pdfminer.converter import PDFPageAggregator
  13. from pdfminer.layout import LAParams, LTLine
  14. from pdfminer.pdfdocument import PDFDocument
  15. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  16. from pdfminer.pdfpage import PDFPage
  17. from pdfminer.pdfparser import PDFParser
  18. from pdfplumber import PDF
  19. from otr.table_line_pdf import _plot
  20. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  21. from format_convert.utils import get_platform, request_post, get_md5_from_bytes
  22. from format_convert.convert import to_html
  23. import multiprocessing as mp
  24. html_output_dir = os.path.dirname(os.path.abspath(__file__)) + "/../html_output/"
  25. def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=False):
  26. if type(p) == tuple:
  27. p, page_no_range, timeout, save_middle, save_html = p
  28. start_time = time.time()
  29. with open(p, "rb") as f:
  30. file_bytes = f.read()
  31. file_base64 = base64.b64encode(file_bytes)
  32. _md5 = get_md5_from_bytes(file_bytes)
  33. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
  34. 'timeout': timeout, 'save_middle': save_middle}
  35. # _url = 'http://121.46.18.113:15010/convert'
  36. # _url = 'http://192.168.2.103:15010/convert'
  37. # _url = 'http://192.168.2.102:15010/convert'
  38. # _url = 'http://172.16.160.65:15010/convert'
  39. _url = 'http://127.0.0.1:15010/convert'
  40. text_str = ""
  41. try:
  42. result = json.loads(request_post(_url, data, time_out=timeout+20))
  43. for t in result.get("result_html"):
  44. text_str += t
  45. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
  46. text_str)
  47. if save_html:
  48. new_path = html_output_dir + p.split(os.sep)[-1].split('.')[0] + '.html'
  49. if 0 < len(text_str) <= 3 and text_str[0] == '-':
  50. print(new_path, text_str)
  51. else:
  52. to_html(new_path, text_str)
  53. print(_md5)
  54. print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
  55. print("result_text", result.get("result_text")[0][:20])
  56. print("is_success", result.get("is_success"))
  57. except:
  58. traceback.print_exc()
  59. print(_md5)
  60. print("is_success", 0)
  61. print(time.time()-start_time)
  62. return p, 1
  63. def test_path():
  64. # _url = 'http://121.46.18.113:15010/convert'
  65. _url = 'http://192.168.0.115:15010/convert'
  66. print(_url)
  67. p = '/data/fangjiasheng/format_conversion_maxcompute/1.png'
  68. data = {"file_path": p, "type": p.split(".")[-1], "filemd5": 100, 'page_no': '1,-1',
  69. 'timeout': 10000, 'save_middle': None}
  70. print(str(data))
  71. # result = json.loads(request_post(_url, data, time_out=1000))
  72. result = json.loads(requests.post(_url, data))
  73. text_str = ""
  74. for t in result.get("result_html"):
  75. text_str += t
  76. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
  77. text_str)
  78. print("result_text", result.get("result_text")[0][:20])
  79. print("is_success", result.get("is_success"))
  80. def test_duplicate(path_list, process_no=None):
  81. start_time = time.time()
  82. # random.shuffle(path_list)
  83. for i in range(10):
  84. if i % 10 == 0:
  85. if process_no is not None:
  86. print("Process", process_no, i*len(path_list), time.time()-start_time)
  87. else:
  88. print("Loop", i*len(path_list), time.time()-start_time)
  89. for p in path_list:
  90. test_one(p, from_remote=True)
  91. def test_maxcompute(p, page_no_range=None):
  92. from format_convert import convert
  93. start_time = time.time()
  94. with open(p, "rb") as f:
  95. file_bytes = f.read()
  96. file_base64 = base64.b64encode(file_bytes)
  97. _md5 = get_md5_from_bytes(file_bytes)
  98. data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range}
  99. result = convert.convert(data)
  100. text_str = ""
  101. for t in result.get("result_html"):
  102. text_str += t
  103. to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
  104. text_str)
  105. print(_md5)
  106. print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
  107. print("result_text", result.get("result_text")[0][:20])
  108. print("is_success", result.get("is_success"))
  109. print(time.time()-start_time)
  110. def run_files(thread_num=20):
  111. paths = glob(r'C:\Users\Administrator\Downloads\招标文件内容提取\*')
  112. temp_list = []
  113. for _path in paths:
  114. new_path = html_output_dir + _path.split(os.sep)[-1].split('.')[0] + '.html'
  115. if os.path.exists(new_path):
  116. continue
  117. temp_list.append(_path)
  118. paths = temp_list
  119. print('len(paths)', len(paths))
  120. with concurrent.futures.ThreadPoolExecutor(max_workers=thread_num) as executor:
  121. tasks = []
  122. for _path in paths:
  123. tasks.append((_path, '1,-1', 10000, None, True))
  124. # 提交任务给线程池
  125. results = executor.map(test_one, tasks)
  126. for result in results:
  127. print(result)
  128. def test_kimi():
  129. MOONSHOT_API_KEY = 'sk-ZqQBQfVBrs1lIilWVgggYqFwGcMu5pjlCeQf2SZL1KDlg1Pj'
  130. paths = glob(html_output_dir + '*.html')
  131. for p in paths[:100]:
  132. with open(p, 'r', encoding='utf-8') as f:
  133. _str = f.read()
  134. print('len(_str)', len(_str))
  135. data = {
  136. 'model': 'moonshot-v1-8k',
  137. 'messages': [
  138. {
  139. "role": "user",
  140. "content": _str
  141. }
  142. ],
  143. }
  144. _url = 'https://api.moonshot.cn/v1/tokenizers/estimate-token-count'
  145. headers = {'Content-Type': 'application/json',
  146. "Authorization": "Bearer " + MOONSHOT_API_KEY}
  147. result = requests.post(_url, json=data, data=None, headers=headers, timeout=100)
  148. print(result.text)
  149. if __name__ == '__main__':
  150. if get_platform() == "Windows":
  151. # file_path = "C:/Users/Administrator/Downloads/1672314827836.pdf"
  152. # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
  153. # file_path = "C:/Users/Administrator/Desktop/test_xls/error7.xls"
  154. # file_path = "C:/Users/Administrator/Desktop/test_doc/error15.doc"
  155. # file_path = "C:/Users/Administrator/Desktop/test_swf/error1.swf"
  156. # file_path = "C:/Users/Administrator/Desktop/test_rar/error1.rar"
  157. file_path = "C:/Users/Administrator/Desktop/test_image/error7.png"
  158. # file_path = "C:/Users/Administrator/Desktop/test_b_table/error13.pdf"
  159. # file_path = "C:/Users/Administrator/Desktop/test_pdf/表格连接error/error6.pdf"
  160. # file_path = "C:/Users/Administrator/Desktop/test_table_head/error2.pdf"
  161. else:
  162. file_path = "1660296734009.pdf"
  163. test_one(file_path, page_no_range='1,-1', timeout=1000, save_middle=None)
  164. # run_files()
  165. # test_kimi()
  166. # test_path()
  167. # file_path = "C:/Users/Administrator/Downloads/"
  168. # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
  169. # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
  170. # file_path = r"C:\Users\Administrator\Desktop\test_b_table/"
  171. file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
  172. test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
  173. ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
  174. ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
  175. ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
  176. ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
  177. ['error50.pdf', '1,-1'],
  178. ['error59.pdf', '1,-1'],
  179. ['error60.pdf', '1,-1'],
  180. ['error61.pdf', '1,-1'],
  181. ['error7.pdf', '39,57'],
  182. ['error8.pdf', '7,12'],
  183. ['error23.pdf', '1,-1']
  184. ]
  185. index = 11
  186. # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
  187. # 测试maxcompute模式
  188. # _process = mp.Process(target=test_maxcompute, args=(file_path, '1,-1',))
  189. # _process.start()
  190. # _process.join()