# remove_pdf_seal.py
import base64
import io
import json
import os
import re
from glob import glob

import PyPDF2
import cv2
import fitz
import six
from PIL import Image
from PyPDF2 import PdfFileMerger

from format_convert.convert_pdf import PDFConvert
from format_convert.utils import pil2np
from isr.isr_interface import isr, IsrModels, remove_seal
from isr.post_process import get_seal_part, replace_seal_part
  16. file_path = r'D:\BIDI_DOC\比地_文档\方案.pdf'
  17. output_path = '../format_convert/temp/a1/'
  18. output_pdf_path = r'D:\BIDI_DOC\比地_文档\方案_去印章.pdf'
  19. def get_pdf_image():
  20. obj = PDFConvert(file_path, output_path)
  21. obj.convert(limit_page_cnt=1000)
  22. def use_isr():
  23. isr_yolo_model, isr_model = IsrModels().get_model()
  24. paths = glob(output_path + '*')
  25. for p in paths:
  26. print('p', p)
  27. with open(p, 'rb') as f:
  28. img_bytes = f.read()
  29. img_base64 = base64.b64encode(img_bytes)
  30. result = isr(img_base64, isr_yolo_model, isr_model)
  31. img_new = result.get('image')
  32. if isinstance(img_new, list):
  33. img_new = cv2.imread(p)
  34. name = p.split(os.sep)[-1]
  35. new_p = output_path + 'new_' + name
  36. print('new_p', new_p)
  37. cv2.imwrite(new_p, img_new)
  38. def base64_to_pil(string):
  39. try:
  40. # my own train data
  41. string = bytes(string, 'utf-8')
  42. base64_data = base64.b64decode(string)
  43. # with open('temp.jpg', 'wb') as f:
  44. # f.write(base64_data)
  45. # print("base64_to_PIL")
  46. buf = six.BytesIO()
  47. buf.write(base64_data)
  48. buf.seek(0)
  49. img = Image.open(buf).convert('RGB')
  50. return img
  51. except Exception as e:
  52. print(e)
  53. return None
  54. def manual_detect_re_isr():
  55. isr_yolo_model, isr_model = IsrModels().get_model()
  56. paths = glob(output_path + '*.json')
  57. for p in paths:
  58. print(p)
  59. with open(p) as f:
  60. json_data = json.loads(f.read())
  61. shapes = json_data.get('shapes')
  62. image_data = json_data.get('imageData')
  63. image_pil = base64_to_pil(image_data)
  64. image_np = pil2np(image_pil)
  65. image_path = output_path + json_data.get('imagePath')
  66. lines = []
  67. boxes = []
  68. for shape in shapes:
  69. lines.append(shape['points'])
  70. [x0, y0], [x1, y1] = shape['points']
  71. box = [[int(x0), int(y0)], [int(x1), int(y1)]]
  72. # label = shape['label']
  73. boxes.append(box)
  74. part_list = get_seal_part(image_np, boxes, [])
  75. new_part_list = []
  76. for part in part_list:
  77. part_remove = remove_seal(part, isr_model)
  78. new_part_list.append(part_remove)
  79. img_replace = replace_seal_part(image_np, new_part_list, boxes)
  80. cv2.imwrite(image_path, img_replace)
  81. def image_to_pdf():
  82. paths = glob(output_path + 'new_*.png')
  83. image_index_list = []
  84. for p in paths:
  85. print(p)
  86. name = p.split(os.sep)[-1]
  87. index = int(re.search('\d+', name).group())
  88. image_index_list.append([p, index])
  89. # 去除顶部logo
  90. image_np = cv2.imread(p)
  91. image_np[:145, :, :] = 255
  92. cv2.imwrite(p, image_np)
  93. image_index_list.sort(key=lambda x: x[1])
  94. pdf_doc = fitz.open()
  95. for p, index in image_index_list:
  96. print('index', index)
  97. # 用fitz读图片
  98. img = fitz.open(p)
  99. # 将图片转换为图片pdf
  100. img_pdf_page = img.convert_to_pdf()
  101. # 用fitz读图片pdf
  102. img_pdf_page = fitz.open("pdf", img_pdf_page)
  103. # 将图片pdf插入到主pdf中
  104. pdf_doc.insert_pdf(img_pdf_page, from_page=0, to_page=len(img_pdf_page) - 1)
  105. img.close()
  106. # 将结果写入新的PDF文件
  107. pdf_doc.save(output_pdf_path)
  108. pdf_doc.close()
  109. if __name__ == '__main__':
  110. image_to_pdf()