# convert_pdf.py
  1. import io
  2. import logging
  3. import os
  4. import re
  5. import sys
  6. sys.path.append(os.path.dirname(__file__) + "/../")
  7. import time
  8. import pdfminer
  9. import timeout_decorator
  10. from PIL import Image
  11. from format_convert.convert_image import image_preprocess
  12. from format_convert.convert_need_interface import from_ocr_interface, from_office_interface
  13. import traceback
  14. import cv2
  15. import PyPDF2
  16. from PyPDF2 import PdfFileReader, PdfFileWriter
  17. from pdfminer.pdfparser import PDFParser
  18. from pdfminer.pdfdocument import PDFDocument
  19. from pdfminer.pdfpage import PDFPage
  20. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  21. from pdfminer.converter import PDFPageAggregator
  22. from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar
  23. from format_convert import get_memory_info
  24. from utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity
  25. import fitz
  26. @get_memory_info.memory_decorator
  27. def pdf2Image(path, save_dir):
  28. logging.info("into pdf2Image")
  29. try:
  30. try:
  31. doc = fitz.open(path)
  32. except Exception as e:
  33. logging.info("pdf format error!")
  34. # print("pdf format error!", e)
  35. return [-3]
  36. # output_image_list = []
  37. output_image_dict = {}
  38. page_count = doc.page_count
  39. for page_no in range(page_count):
  40. # 限制pdf页数,只取前10页后10页
  41. if page_count > 20:
  42. if 10 <= page_no < page_count-10:
  43. # logging.info("pdf2Image: pdf pages count " + str(doc.page_count)
  44. # + ", only get 70 pages")
  45. continue
  46. try:
  47. page = doc.loadPage(page_no)
  48. output = save_dir + "_page" + str(page_no) + ".png"
  49. rotate = int(0)
  50. # 每个尺寸的缩放系数为1.3,这将为我们生成分辨率提高2.6的图像。
  51. # 此处若是不做设置,默认图片大小为:792X612, dpi=96
  52. # (1.33333333 --> 1056x816) (2 --> 1584x1224)
  53. # (1.183, 2.28 --> 1920x1080)
  54. zoom_x = 3.
  55. zoom_y = 3.
  56. # mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  57. mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  58. pix = page.getPixmap(matrix=mat, alpha=False)
  59. pix.writePNG(output)
  60. pdf_image = cv2.imread(output)
  61. print("pdf_image", page_no, pdf_image.shape)
  62. # output_image_list.append([page_no, output])
  63. output_image_dict[int(page_no)] = output
  64. except ValueError as e:
  65. traceback.print_exc()
  66. if str(e) == "page not in document":
  67. logging.info("pdf2Image page not in document! continue..." + str(page_no))
  68. continue
  69. elif "encrypted" in str(e):
  70. logging.info("pdf2Image document need password " + str(page_no))
  71. return [-7]
  72. except RuntimeError as e:
  73. if "cannot find page" in str(e):
  74. logging.info("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
  75. continue
  76. else:
  77. traceback.print_exc()
  78. return [-3]
  79. return [output_image_dict]
  80. except Exception as e:
  81. logging.info("pdf2Image error!")
  82. print("pdf2Image", traceback.print_exc())
  83. return [-1]
  84. @get_memory_info.memory_decorator
  85. @timeout_decorator.timeout(300, timeout_exception=TimeoutError)
  86. def pdf_analyze(interpreter, page, device):
  87. logging.info("into pdf_analyze")
  88. # 解析pdf中的不含表格的页
  89. pdf_time = time.time()
  90. print("pdf_analyze interpreter process...")
  91. interpreter.process_page(page)
  92. print("pdf_analyze device get_result...")
  93. layout = device.get_result()
  94. logging.info("pdf2text read time " + str(time.time()-pdf_time))
  95. return layout
  96. @get_memory_info.memory_decorator
  97. def pdf2text(path, unique_type_dir):
  98. logging.info("into pdf2text")
  99. try:
  100. # pymupdf pdf to image
  101. save_dir = path.split(".")[-2] + "_" + path.split(".")[-1]
  102. output_image_dict = pdf2Image(path, save_dir)
  103. if judge_error_code(output_image_dict):
  104. return output_image_dict
  105. output_image_dict = output_image_dict[0]
  106. output_image_no_list = list(output_image_dict.keys())
  107. output_image_no_list.sort(key=lambda x: x)
  108. # 获取每页pdf提取的文字、表格的列数、轮廓点、是否含表格、页码
  109. # page_info_list = []
  110. page_info_dict = {}
  111. has_table_dict = {}
  112. no_table_dict = {}
  113. for page_no in output_image_no_list:
  114. img_path = output_image_dict.get(page_no)
  115. print("pdf page", page_no, "in total", output_image_no_list[-1])
  116. # 读不出来的跳过
  117. try:
  118. img = cv2.imread(img_path)
  119. img_size = img.shape
  120. except:
  121. logging.info("pdf2text read image in page fail! continue...")
  122. continue
  123. # 每张图片处理
  124. text, column_list, outline_points, is_table = image_preprocess(img, img_path,
  125. use_ocr=False)
  126. if judge_error_code(text):
  127. return text
  128. # page_info_list.append([text, column_list, outline_points, is_table,
  129. # page_no, img_size])
  130. page_info = [text, column_list, outline_points, is_table, img_size]
  131. page_info_dict[int(page_no)] = page_info
  132. # 包含table的和不包含table的
  133. if is_table:
  134. has_table_dict[int(page_no)] = page_info
  135. else:
  136. no_table_dict[int(page_no)] = page_info
  137. has_table_no_list = list(has_table_dict.keys())
  138. has_table_no_list.sort(key=lambda x: x)
  139. page_no_list = list(page_info_dict.keys())
  140. page_no_list.sort(key=lambda x: x)
  141. # 页码表格连接
  142. table_connect_list, connect_text_list = page_table_connect(has_table_dict)
  143. if judge_error_code(table_connect_list):
  144. return table_connect_list
  145. # 连接的页码
  146. table_connect_page_no_list = []
  147. for area in connect_text_list:
  148. table_connect_page_no_list.append(area[1])
  149. print("pdf2text table_connect_list", table_connect_list)
  150. print("connect_text_list", connect_text_list)
  151. # pdfminer 方式
  152. try:
  153. fp = open(path, 'rb')
  154. # 用文件对象创建一个PDF文档分析器
  155. parser = PDFParser(fp)
  156. # 创建一个PDF文档
  157. doc = PDFDocument(parser)
  158. # 连接分析器,与文档对象
  159. rsrcmgr = PDFResourceManager()
  160. device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
  161. interpreter = PDFPageInterpreter(rsrcmgr, device)
  162. # 判断是否能读pdf
  163. for page in PDFPage.create_pages(doc):
  164. break
  165. except pdfminer.psparser.PSEOF as e:
  166. # pdfminer 读不了空白页的对象,直接使用pymupdf转换出的图片进行ocr识别
  167. logging.info("pdf2text " + str(e) + " use ocr read pdf!")
  168. text_list = []
  169. for page_no in page_no_list:
  170. logging.info("pdf2text ocr page_no " + str(page_no))
  171. page_info = page_info_dict.get(page_no)
  172. # 表格
  173. if page_info[3]:
  174. # 判断表格是否跨页连接
  175. area_no = 0
  176. jump_page = 0
  177. for area in table_connect_list:
  178. if page_no in area:
  179. # 只记录一次text
  180. if page_no == area[0]:
  181. image_text = connect_text_list[area_no][0]
  182. text_list.append([image_text, page_no, 0])
  183. jump_page = 1
  184. area_no += 1
  185. # 是连接页的跳过后面步骤
  186. if jump_page:
  187. continue
  188. # 直接取text
  189. image_text = page_info_dict.get(page_no)[0]
  190. text_list.append([image_text, page_no, 0])
  191. # 非表格
  192. else:
  193. with open(output_image_dict.get(page_no), "rb") as ff:
  194. image_stream = ff.read()
  195. image_text = from_ocr_interface(image_stream)
  196. text_list.append([image_text, page_no, 0])
  197. text_list.sort(key=lambda z: z[1])
  198. text = ""
  199. for t in text_list:
  200. text += t[0]
  201. return [text]
  202. except Exception as e:
  203. logging.info("pdf format error!")
  204. traceback.print_exc()
  205. return [-3]
  206. text_list = []
  207. page_no = 0
  208. pages = PDFPage.create_pages(doc)
  209. pages = list(pages)
  210. page_count = len(pages)
  211. for page in pages:
  212. logging.info("pdf2text pymupdf page_no " + str(page_no))
  213. # 限制pdf页数,只取前100页
  214. # if page_no >= 70:
  215. # logging.info("pdf2text: pdf pages only get 70 pages")
  216. # break
  217. if page_count > 20:
  218. if 10 <= page_no < page_count-10:
  219. page_no += 1
  220. continue
  221. # 判断页码在含表格页码中,直接拿已生成的text
  222. if page_no in has_table_no_list:
  223. # 判断表格是否跨页连接
  224. area_no = 0
  225. jump_page = 0
  226. for area in table_connect_list:
  227. if page_no in area:
  228. # 只记录一次text
  229. if page_no == area[0]:
  230. image_text = connect_text_list[area_no][0]
  231. text_list.append([image_text, page_no, 0])
  232. jump_page = 1
  233. area_no += 1
  234. # 是连接页的跳过后面步骤
  235. if jump_page:
  236. page_no += 1
  237. continue
  238. # 直接取text
  239. image_text = has_table_dict.get(page_no)[0]
  240. text_list.append([image_text, page_no, 0])
  241. page_no += 1
  242. continue
  243. # 不含表格的解析pdf
  244. else:
  245. if get_platform() == "Windows":
  246. try:
  247. interpreter.process_page(page)
  248. layout = device.get_result()
  249. except Exception:
  250. logging.info("pdf2text pdfminer read pdf page error! continue...")
  251. continue
  252. else:
  253. # 设置超时时间
  254. try:
  255. # 解析pdf中的不含表格的页
  256. if get_platform() == "Windows":
  257. origin_pdf_analyze = pdf_analyze.__wrapped__
  258. layout = origin_pdf_analyze(interpreter, page, device)
  259. else:
  260. layout = pdf_analyze(interpreter, page, device)
  261. except TimeoutError as e:
  262. logging.info("pdf2text pdfminer read pdf page time out!")
  263. return [-4]
  264. except Exception:
  265. logging.info("pdf2text pdfminer read pdf page error! continue...")
  266. continue
  267. # 判断该页有没有文字对象,没有则有可能是有水印
  268. only_image = 1
  269. image_count = 0
  270. for x in layout:
  271. if isinstance(x, LTTextBoxHorizontal):
  272. only_image = 0
  273. if isinstance(x, LTFigure):
  274. image_count += 1
  275. # 如果该页图片数量过多,直接ocr整页识别
  276. logging.info("pdf2text image_count " + str(image_count))
  277. if image_count >= 3:
  278. image_text = page_info_dict.get(page_no)[0]
  279. if image_text is None:
  280. with open(output_image_dict.get(page_no), "rb") as ff:
  281. image_stream = ff.read()
  282. image_text = from_ocr_interface(image_stream)
  283. if judge_error_code(image_text):
  284. return image_text
  285. page_info_dict[page_no][0] = image_text
  286. text_list.append([image_text, page_no, 0])
  287. page_no += 1
  288. continue
  289. order_list = []
  290. for x in layout:
  291. # 该对象是否是ocr识别
  292. ocr_flag = 0
  293. if get_platform() == "Windows":
  294. # print("x", page_no, x)
  295. print()
  296. if isinstance(x, LTTextBoxHorizontal):
  297. image_text = x.get_text()
  298. # 无法识别编码,用ocr
  299. if re.search('[(]cid:[0-9]+[)]', image_text):
  300. print(re.search('[(]cid:[0-9]+[)]', image_text))
  301. image_text = page_info_dict.get(page_no)[0]
  302. if image_text is None:
  303. with open(output_image_dict.get(page_no), "rb") as ff:
  304. image_stream = ff.read()
  305. image_text = from_ocr_interface(image_stream)
  306. if judge_error_code(image_text):
  307. return image_text
  308. page_info_dict[page_no][0] = image_text
  309. image_text = add_div(image_text)
  310. # order_list.append([image_text, page_no, x.bbox[1]])
  311. order_list = [[image_text, page_no, x.bbox[1]]]
  312. break
  313. else:
  314. image_text = add_div(image_text)
  315. order_list.append([image_text, page_no, x.bbox[1]])
  316. continue
  317. if isinstance(x, LTFigure):
  318. for image in x:
  319. if isinstance(image, LTImage):
  320. try:
  321. print("pdf2text LTImage size", page_no, image.width, image.height)
  322. image_stream = image.stream.get_data()
  323. # 小的图忽略
  324. if image.width <= 300 and image.height <= 300:
  325. continue
  326. # 有些水印导致pdf分割、读取报错
  327. # if image.width <= 200 and image.height<=200:
  328. # continue
  329. # img_test = Image.open(io.BytesIO(image_stream))
  330. # img_test.save('temp/LTImage.jpg')
  331. # 查看提取的图片高宽,太大则抛错用pdf输出图进行ocr识别
  332. img_test = Image.open(io.BytesIO(image_stream))
  333. if img_test.size[1] > 2000 or img_test.size[0] > 1500:
  334. print("pdf2text LTImage stream output size", img_test.size)
  335. raise Exception
  336. # 比较小的图则直接保存用ocr识别
  337. else:
  338. img_test.save('temp/LTImage.jpg')
  339. with open('temp/LTImage.jpg', "rb") as ff:
  340. image_stream = ff.read()
  341. image_text = from_ocr_interface(image_stream)
  342. if judge_error_code(image_text):
  343. return image_text
  344. # except pdfminer.pdftypes.PDFNotImplementedError:
  345. # with open(output_image_list[page_no], "rb") as ff:
  346. # image_stream = ff.read()
  347. except Exception:
  348. logging.info("pdf2text pdfminer read image in page " + str(page_no) +
  349. " fail! use pymupdf read image...")
  350. print(traceback.print_exc())
  351. image_text = page_info_dict.get(page_no)[0]
  352. if image_text is None:
  353. with open(output_image_dict.get(page_no), "rb") as ff:
  354. image_stream = ff.read()
  355. image_text = from_ocr_interface(image_stream)
  356. if judge_error_code(image_text):
  357. return image_text
  358. page_info_dict[page_no][0] = image_text
  359. ocr_flag = 1
  360. # 判断只拿到了水印图: 无文字输出且只有图片对象
  361. if image_text == "" and only_image:
  362. # 拆出该页pdf
  363. try:
  364. logging.info("pdf2text guess pdf has watermark")
  365. split_path = get_single_pdf(path, page_no)
  366. except:
  367. # 如果拆分抛异常,则大概率不是水印图,用ocr识别图片
  368. logging.info("pdf2text guess pdf has no watermark")
  369. image_text = page_info_dict.get(page_no)[0]
  370. if image_text is None:
  371. with open(output_image_dict.get(page_no), "rb") as ff:
  372. image_stream = ff.read()
  373. image_text = from_ocr_interface(image_stream)
  374. order_list.append([image_text, page_no, -1])
  375. page_info_dict[page_no][0] = image_text
  376. ocr_flag = 1
  377. continue
  378. if judge_error_code(split_path):
  379. return split_path
  380. # 调用office格式转换
  381. file_path = from_office_interface(split_path, unique_type_dir, 'html', 3)
  382. # if file_path == [-3]:
  383. # return [-3]
  384. if judge_error_code(file_path):
  385. return file_path
  386. # 获取html文本
  387. image_text = get_html_p(file_path)
  388. if judge_error_code(image_text):
  389. return image_text
  390. if get_platform() == "Windows":
  391. print("image_text", page_no, x.bbox[1], image_text)
  392. with open("temp" + str(x.bbox[0]) + ".jpg", "wb") as ff:
  393. ff.write(image_stream)
  394. image_text = add_div(image_text)
  395. if ocr_flag:
  396. order_list.append([image_text, page_no, -1])
  397. else:
  398. order_list.append([image_text, page_no, x.bbox[1]])
  399. order_list.sort(key=lambda z: z[2], reverse=True)
  400. # 有ocr参与识别
  401. if order_list[-1][2] == -1:
  402. ocr_order_list = [order_list[-1]]
  403. not_ocr_order_list = []
  404. not_ocr_text = ""
  405. # 去重,因读取失败而重复获取
  406. for order in order_list:
  407. if order[2] != -1:
  408. not_ocr_order_list.append(order)
  409. not_ocr_text += order[0]
  410. if string_similarity(ocr_order_list[0][0], not_ocr_text) >= 0.85:
  411. order_list = not_ocr_order_list
  412. else:
  413. order_list = ocr_order_list
  414. for order in order_list:
  415. text_list.append(order)
  416. page_no += 1
  417. text = ""
  418. for t in text_list:
  419. # text += add_div(t[0])
  420. if t[0] is not None:
  421. text += t[0]
  422. return [text]
  423. except UnicodeDecodeError as e:
  424. logging.info("pdf2text pdfminer create pages failed! " + str(e))
  425. return [-3]
  426. except Exception as e:
  427. logging.info("pdf2text error!")
  428. print("pdf2text", traceback.print_exc())
  429. return [-1]
  430. def get_single_pdf(path, page_no):
  431. logging.info("into get_single_pdf")
  432. try:
  433. # print("path, ", path)
  434. pdf_origin = PdfFileReader(path, strict=False)
  435. pdf_new = PdfFileWriter()
  436. pdf_new.addPage(pdf_origin.getPage(page_no))
  437. path_new = path.split(".")[0] + "_split.pdf"
  438. with open(path_new, "wb") as ff:
  439. pdf_new.write(ff)
  440. return path_new
  441. except PyPDF2.utils.PdfReadError as e:
  442. raise e
  443. except Exception as e:
  444. logging.info("get_single_pdf error! page " + str(page_no))
  445. print("get_single_pdf", traceback.print_exc())
  446. raise e
  447. def page_table_connect(has_table_dict):
  448. logging.info("into page_table_connect")
  449. if not has_table_dict:
  450. return [], []
  451. try:
  452. # 判断是否有页码的表格相连
  453. table_connect_list = []
  454. temp_list = []
  455. # 离图片顶部或底部距离,页面高度的1/7
  456. threshold = 7
  457. page_no_list = list(has_table_dict.keys())
  458. page_no_list.sort(key=lambda x: x)
  459. for i in range(1, len(page_no_list)):
  460. page_info = has_table_dict.get(page_no_list[i])
  461. last_page_info = has_table_dict.get(page_no_list[i-1])
  462. # 页码需相连
  463. if page_no_list[i] - page_no_list[i-1] == 1:
  464. # 上一页最后一个区域的列数和下一页第一个区域列数都为0,且相等
  465. if not last_page_info[1][-1] and not page_info[1][0] and \
  466. last_page_info[1][-1] == page_info[1][0]:
  467. # 上一页的轮廓点要离底部一定距离内,下一页的轮廓点要离顶部一定距离内
  468. if last_page_info[4][0] - last_page_info[2][-1][1][1] \
  469. <= int(last_page_info[4][0]/threshold) \
  470. and page_info[2][0][0][1] - 0 \
  471. <= int(page_info[4][0]/threshold):
  472. temp_list.append(page_no_list[i-1])
  473. temp_list.append(page_no_list[i])
  474. continue
  475. # 条件不符合的,存储之前保存的连接页码
  476. if len(temp_list) > 1:
  477. temp_list = list(set(temp_list))
  478. temp_list.sort(key=lambda x: x)
  479. table_connect_list.append(temp_list)
  480. temp_list = []
  481. if len(temp_list) > 1:
  482. temp_list = list(set(temp_list))
  483. temp_list.sort(key=lambda x: x)
  484. table_connect_list.append(temp_list)
  485. temp_list = []
  486. # 连接两页内容
  487. connect_text_list = []
  488. for area in table_connect_list:
  489. first_page_no = area[0]
  490. area_page_text = str(has_table_dict.get(first_page_no)[0])
  491. for i in range(1, len(area)):
  492. current_page_no = area[i]
  493. current_page_text = str(has_table_dict.get(current_page_no)[0])
  494. # 连接两个table
  495. table_prefix = re.finditer('<table border="1">', current_page_text)
  496. index_list = []
  497. for t in table_prefix:
  498. index_list.append(t.span())
  499. delete_index = index_list[0]
  500. current_page_text = current_page_text[:delete_index[0]] \
  501. + current_page_text[delete_index[1]:]
  502. table_suffix = re.finditer('</table>', area_page_text)
  503. index_list = []
  504. for t in table_suffix:
  505. index_list.append(t.span())
  506. delete_index = index_list[-1]
  507. area_page_text = area_page_text[:delete_index[0]] \
  508. + area_page_text[delete_index[1]:]
  509. area_page_text = area_page_text + current_page_text
  510. connect_text_list.append([area_page_text, area])
  511. return table_connect_list, connect_text_list
  512. except Exception as e:
  513. # print("page_table_connect", e)
  514. logging.info("page_table_connect error!")
  515. print("page_table_connect", traceback.print_exc())
  516. return [-1], [-1]