"""Helper script for producing a seal-free copy of a PDF.

The intended workflow, as far as the functions below show, is:

1. ``get_pdf_image``       - run ``PDFConvert`` on ``file_path``, writing its
                             output into ``output_path``.
2. ``use_isr``             - run ``isr`` on every file in ``output_path`` and
                             save the result as ``new_<name>``.
3. ``manual_detect_re_isr``- re-run ``remove_seal`` on manually annotated
                             boxes stored in ``*.json`` files.
4. ``image_to_pdf``        - assemble the ``new_*.png`` images into
                             ``output_pdf_path``.

Only step 4 is executed when the file is run as a script.
"""
import base64
import json
import os
import re
from glob import glob

import PyPDF2
import cv2
import fitz
import six
from PIL import Image
from PyPDF2 import PdfFileMerger

from format_convert.convert_pdf import PDFConvert
from format_convert.utils import pil2np
from isr.isr_interface import isr, IsrModels, remove_seal
from isr.post_process import get_seal_part, replace_seal_part

file_path = r'D:\BIDI_DOC\比地_文档\方案.pdf'
output_path = '../format_convert/temp/a1/'
output_pdf_path = r'D:\BIDI_DOC\比地_文档\方案_去印章.pdf'


def get_pdf_image():
    """Run ``PDFConvert`` on ``file_path`` with ``output_path`` as its target.

    At most 1000 pages are converted (``limit_page_cnt=1000``).
    NOTE(review): presumably this renders each page to an image file inside
    ``output_path`` - confirm against ``PDFConvert``.
    """
    obj = PDFConvert(file_path, output_path)
    obj.convert(limit_page_cnt=1000)


def use_isr():
    """Run ``isr`` on every file in ``output_path`` and save ``new_<name>``.

    Each file is read as raw bytes, base64-encoded and passed to ``isr``.
    When ``isr`` returns a list under ``'image'`` the untouched original
    image is written instead. Files that cannot be turned into an image are
    skipped rather than aborting the whole run.

    NOTE(review): the glob pattern is ``*``, so outputs of a previous run
    (``new_*``) and any ``*.json`` files in the directory are picked up too.
    """
    isr_yolo_model, isr_model = IsrModels().get_model()
    for p in glob(output_path + '*'):
        print('p', p)
        with open(p, 'rb') as f:
            img_base64 = base64.b64encode(f.read())

        result = isr(img_base64, isr_yolo_model, isr_model)
        img_new = result.get('image')
        # A list here presumably means "nothing was produced" - fall back to
        # the original image so that every page still gets a new_ file.
        if isinstance(img_new, list):
            img_new = cv2.imread(p)
        # cv2.imread returns None for unreadable / non-image files and
        # cv2.imwrite would raise on it.
        if img_new is None:
            print('skip, no image available for', p)
            continue

        # basename copes with mixed '/' and '\\' separators on Windows.
        name = os.path.basename(p)
        new_p = output_path + 'new_' + name
        print('new_p', new_p)
        cv2.imwrite(new_p, img_new)


def base64_to_pil(string):
    """Decode a base64-encoded image into an RGB ``PIL.Image``.

    :param string: base64 payload, either ``str`` or ``bytes``.
    :return: the decoded image converted to ``'RGB'``, or ``None`` when the
        payload cannot be decoded or opened (the error is printed).
    """
    try:
        # Accept both text and bytes; bytes(x, 'utf-8') rejects bytes input.
        if isinstance(string, str):
            string = string.encode('utf-8')
        base64_data = base64.b64decode(string)
        buf = six.BytesIO(base64_data)
        return Image.open(buf).convert('RGB')
    except Exception as e:
        # Best effort by design: callers check for None.
        print(e)
        return None


def manual_detect_re_isr():
    """Re-run seal removal on manually annotated regions.

    For every ``*.json`` file in ``output_path`` this reads the keys
    ``'shapes'``, ``'imageData'`` and ``'imagePath'``, turns each shape's two
    points into an integer box, cuts the boxes out with ``get_seal_part``,
    passes every part through ``remove_seal``, pastes the results back with
    ``replace_seal_part`` and overwrites ``output_path + imagePath``.

    Files whose ``imageData`` cannot be decoded are skipped.
    NOTE(review): each shape is assumed to hold exactly two ``[x, y]``
    points (rectangle annotation) - other shapes raise on unpacking.
    """
    # Only the removal model is needed here; the detector is replaced by
    # the manual annotations.
    _, isr_model = IsrModels().get_model()
    for p in glob(output_path + '*.json'):
        print(p)
        with open(p) as f:
            json_data = json.load(f)

        image_pil = base64_to_pil(json_data.get('imageData'))
        if image_pil is None:
            print('skip, cannot decode imageData of', p)
            continue
        image_np = pil2np(image_pil)
        image_path = output_path + json_data.get('imagePath')

        boxes = []
        # A missing 'shapes' key is treated as "no annotations".
        for shape in json_data.get('shapes') or []:
            [x0, y0], [x1, y1] = shape['points']
            boxes.append([[int(x0), int(y0)], [int(x1), int(y1)]])

        part_list = get_seal_part(image_np, boxes, [])
        new_part_list = [remove_seal(part, isr_model) for part in part_list]
        img_replace = replace_seal_part(image_np, new_part_list, boxes)
        cv2.imwrite(image_path, img_replace)


def image_to_pdf(logo_height=145):
    """Assemble the ``new_*.png`` images in ``output_path`` into one PDF.

    Every image first has its top ``logo_height`` pixel rows painted white
    (to remove the logo at the top of the page) and is saved back in place.
    The images are then ordered by the first number in their file name and
    appended, one page each, to a new PDF written to ``output_pdf_path``.

    :param logo_height: number of pixel rows at the top of each image to
        blank out; defaults to 145.
    """
    image_index_list = []
    for p in glob(output_path + 'new_*.png'):
        print(p)
        # Use only the file name: the directory part may contain digits
        # too (e.g. 'a1') and must not be mistaken for the page index.
        name = os.path.basename(p)
        match = re.search(r'\d+', name)
        if match is None:
            print('skip, no page index in file name', p)
            continue

        image_np = cv2.imread(p)
        if image_np is None:
            print('skip, cannot read image', p)
            continue

        # Remove the logo at the top of the page.
        image_np[:logo_height, :, :] = 255
        cv2.imwrite(p, image_np)
        image_index_list.append([p, int(match.group())])

    if not image_index_list:
        # Nothing to assemble; an empty document cannot be saved.
        print('no images found, nothing written to', output_pdf_path)
        return

    image_index_list.sort(key=lambda x: x[1])

    pdf_doc = fitz.open()
    try:
        for p, index in image_index_list:
            print('index', index)
            # Open the image with fitz and convert it to a one-image PDF.
            img = fitz.open(p)
            try:
                img_pdf_bytes = img.convert_to_pdf()
            finally:
                img.close()

            # Re-open the PDF bytes and append its pages to the main PDF.
            img_pdf_page = fitz.open("pdf", img_pdf_bytes)
            try:
                pdf_doc.insert_pdf(img_pdf_page, from_page=0,
                                   to_page=len(img_pdf_page) - 1)
            finally:
                img_pdf_page.close()

        # Write the result to the new PDF file.
        pdf_doc.save(output_pdf_path)
    finally:
        pdf_doc.close()


if __name__ == '__main__':
    image_to_pdf()