123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- import base64
- import json
- import os
- import re
- from glob import glob
- import PyPDF2
- import cv2
- import fitz
- import six
- from PIL import Image
- from PyPDF2 import PdfFileMerger
- from format_convert.convert_pdf import PDFConvert
- from format_convert.utils import pil2np
- from isr.isr_interface import isr, IsrModels, remove_seal
- from isr.post_process import get_seal_part, replace_seal_part
# Input PDF handed to PDFConvert in get_pdf_image().
file_path = r'D:\BIDI_DOC\比地_文档\方案.pdf'

# Working directory (trailing slash required: paths are built by plain string
# concatenation). Holds the page images, the 'new_*' results and the '*.json'
# annotation files read by the functions below.
output_path = '../format_convert/temp/a1/'

# Where image_to_pdf() saves the reassembled PDF.
output_pdf_path = r'D:\BIDI_DOC\比地_文档\方案_去印章.pdf'
def get_pdf_image():
    """Run ``PDFConvert`` on ``file_path`` with ``output_path`` as its second argument.

    The conversion is capped at 1000 pages (``limit_page_cnt=1000``).
    NOTE(review): presumably this renders one image per PDF page into
    ``output_path`` -- confirm against ``format_convert.convert_pdf``.
    """
    converter = PDFConvert(file_path, output_path)
    converter.convert(limit_page_cnt=1000)
def use_isr():
    """Run the seal-removal model on every file in ``output_path``.

    Each file is read as bytes, base64-encoded and passed to ``isr`` together
    with both models returned by ``IsrModels().get_model()``. The image found
    under the ``'image'`` key of the result is written back into
    ``output_path`` as ``new_<original name>``.

    Files whose name already starts with ``new_`` are skipped: they are
    results of an earlier run, and processing them again would create
    ``new_new_*`` files that ``image_to_pdf`` would then pick up as extra pages.
    """
    isr_yolo_model, isr_model = IsrModels().get_model()

    for p in glob(output_path + '*'):
        name = os.path.basename(p)
        if name.startswith('new_'):
            # Output of a previous run, not an input page.
            continue

        print('p', p)
        with open(p, 'rb') as f:
            img_base64 = base64.b64encode(f.read())

        result = isr(img_base64, isr_yolo_model, isr_model)
        img_new = result.get('image')
        # NOTE(review): a list here is presumably isr's "nothing produced"
        # marker -- confirm; in that case the untouched original is kept.
        if isinstance(img_new, list):
            img_new = cv2.imread(p)

        new_p = output_path + 'new_' + name
        print('new_p', new_p)
        cv2.imwrite(new_p, img_new)
def base64_to_pil(string):
    """Decode base64 text into an RGB ``PIL.Image``.

    :param string: base64 payload as ``str``.
    :return: the decoded image converted to ``'RGB'``, or ``None`` when any
        step fails (the exception is printed, not raised).
    """
    try:
        raw = base64.b64decode(bytes(string, 'utf-8'))
        # Wrap the decoded bytes in a file-like object positioned at offset 0.
        stream = six.BytesIO(raw)
        return Image.open(stream).convert('RGB')
    except Exception as err:
        print(err)
        return None
def manual_detect_re_isr():
    """Re-run seal removal on manually annotated boxes.

    For every ``*.json`` file in ``output_path``:

    1. decode the embedded ``imageData`` (base64) into an array,
    2. turn each entry of ``shapes`` into an integer box
       ``[[x0, y0], [x1, y1]]`` from its two ``points``,
    3. cut the boxed parts out with ``get_seal_part``, pass each through
       ``remove_seal`` and paste them back with ``replace_seal_part``,
    4. write the result to ``output_path + imagePath``.

    Annotation files whose ``imageData`` cannot be decoded are reported and
    skipped instead of failing inside ``pil2np``.
    NOTE(review): the keys used look like the labelme JSON layout -- confirm.
    """
    # Only the removal model is used here; the boxes come from the annotations.
    _, isr_model = IsrModels().get_model()

    for p in glob(output_path + '*.json'):
        print(p)
        with open(p) as f:
            json_data = json.load(f)

        image_pil = base64_to_pil(json_data.get('imageData'))
        if image_pil is None:
            # base64_to_pil already printed the underlying error.
            print('skip, imageData could not be decoded:', p)
            continue
        image_np = pil2np(image_pil)
        image_path = output_path + json_data.get('imagePath')

        boxes = []
        for shape in json_data.get('shapes'):
            # Each shape is expected to hold exactly two corner points.
            [x0, y0], [x1, y1] = shape['points']
            boxes.append([[int(x0), int(y0)], [int(x1), int(y1)]])

        part_list = get_seal_part(image_np, boxes, [])
        new_part_list = [remove_seal(part, isr_model) for part in part_list]
        img_replace = replace_seal_part(image_np, new_part_list, boxes)
        cv2.imwrite(image_path, img_replace)
def image_to_pdf():
    """Assemble the ``new_*.png`` pages in ``output_path`` into one PDF.

    Steps:

    1. Collect every ``new_*.png`` and take the first run of digits in its
       file name as the page index. Files without digits are reported and
       skipped.
    2. Overwrite the top 145 pixel rows of each image with white (removes the
       logo at the top of the page) and save the image back in place.
    3. Sort by page index, convert each image to a one-image PDF with PyMuPDF
       and append it to a new document.
    4. Save the document to ``output_pdf_path``.

    Every PyMuPDF document opened here is closed, also when a step raises.
    """
    image_index_list = []
    for p in glob(output_path + 'new_*.png'):
        print(p)
        name = os.path.basename(p)
        match = re.search(r'\d+', name)
        if match is None:
            print('skip, no page index in file name:', p)
            continue
        image_index_list.append([p, int(match.group())])

        # Remove the logo at the top of the page.
        image_np = cv2.imread(p)
        image_np[:145, :, :] = 255
        cv2.imwrite(p, image_np)

    image_index_list.sort(key=lambda x: x[1])

    pdf_doc = fitz.open()
    try:
        for p, index in image_index_list:
            print('index', index)

            # Open the image with fitz and convert it to PDF bytes.
            img = fitz.open(p)
            try:
                pdf_bytes = img.convert_to_pdf()
            finally:
                img.close()

            # Re-open those bytes as a PDF and append its pages to the result.
            img_pdf = fitz.open("pdf", pdf_bytes)
            try:
                pdf_doc.insert_pdf(img_pdf, from_page=0, to_page=len(img_pdf) - 1)
            finally:
                img_pdf.close()

        # Write the result to the new PDF file.
        pdf_doc.save(output_pdf_path)
    finally:
        pdf_doc.close()
# Script entry point: only the final image -> PDF assembly step is run here;
# the other steps (get_pdf_image, use_isr, manual_detect_re_isr) are invoked
# by editing this call.
if __name__ == '__main__':
    image_to_pdf()
|