123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577 |
- import io
- import logging
- import os
- import re
- import sys
- sys.path.append(os.path.dirname(__file__) + "/../")
- import time
- import pdfminer
- import timeout_decorator
- from PIL import Image
- from format_convert.convert_image import image_preprocess
- from format_convert.convert_need_interface import from_ocr_interface, from_office_interface
- import traceback
- import cv2
- import PyPDF2
- from PyPDF2 import PdfFileReader, PdfFileWriter
- from pdfminer.pdfparser import PDFParser
- from pdfminer.pdfdocument import PDFDocument
- from pdfminer.pdfpage import PDFPage
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
- from pdfminer.converter import PDFPageAggregator
- from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar
- from format_convert import get_memory_info
- from utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity
- import fitz
@get_memory_info.memory_decorator
def pdf2Image(path, save_dir):
    """Render the pages of a PDF to PNG files with PyMuPDF.

    For documents with more than 20 pages only the first 10 and the last 10
    pages are rendered. Each page is written to
    ``save_dir + "_page" + str(page_no) + ".png"``.

    Args:
        path: path of the PDF file to open.
        save_dir: filename prefix used for the generated PNG files.

    Returns:
        A one-element list. On success the element is a dict mapping the
        0-based page number to the PNG path. On failure it is an error code:
        ``-3`` when the document cannot be opened or a page raises an
        unrecognised ``RuntimeError``, ``-7`` when a ``ValueError`` mentions
        "encrypted", ``-1`` for any other exception.
    """
    logging.info("into pdf2Image")
    doc = None
    try:
        try:
            doc = fitz.open(path)
        except Exception:
            logging.info("pdf format error!")
            return [-3]

        output_image_dict = {}
        page_count = doc.page_count
        for page_no in range(page_count):
            # Limit long PDFs: keep only the first 10 and the last 10 pages.
            if page_count > 20 and 10 <= page_no < page_count - 10:
                continue
            try:
                page = doc.loadPage(page_no)
                output = save_dir + "_page" + str(page_no) + ".png"
                rotate = int(0)
                # Scale both axes by 3 so the rendered image has a higher
                # resolution than the default page size.
                zoom_x = 3.
                zoom_y = 3.
                mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
                pix = page.getPixmap(matrix=mat, alpha=False)
                pix.writePNG(output)
                # Read the file back; an unreadable PNG makes ``.shape`` fail
                # and is reported through the outer handler as [-1].
                pdf_image = cv2.imread(output)
                print("pdf_image", page_no, pdf_image.shape)
                output_image_dict[int(page_no)] = output
            except ValueError as e:
                traceback.print_exc()
                if str(e) == "page not in document":
                    logging.info("pdf2Image page not in document! continue..." + str(page_no))
                    continue
                elif "encrypted" in str(e):
                    logging.info("pdf2Image document need password " + str(page_no))
                    return [-7]
            except RuntimeError as e:
                if "cannot find page" in str(e):
                    logging.info("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
                    continue
                else:
                    traceback.print_exc()
                    return [-3]
        return [output_image_dict]
    except Exception:
        logging.info("pdf2Image error!")
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print().
        traceback.print_exc()
        return [-1]
    finally:
        # Release the document handle on every exit path.
        if doc is not None:
            doc.close()
@get_memory_info.memory_decorator
@timeout_decorator.timeout(300, timeout_exception=TimeoutError)
def pdf_analyze(interpreter, page, device):
    """Process one pdfminer page and return the aggregated layout.

    The call is wrapped in a 300 second timeout that raises ``TimeoutError``.

    Args:
        interpreter: object whose ``process_page(page)`` is invoked.
        page: the page handed to the interpreter.
        device: object whose ``get_result()`` supplies the layout.

    Returns:
        Whatever ``device.get_result()`` returns after the page is processed.
    """
    logging.info("into pdf_analyze")
    started_at = time.time()

    print("pdf_analyze interpreter process...")
    interpreter.process_page(page)

    print("pdf_analyze device get_result...")
    page_layout = device.get_result()

    elapsed = time.time() - started_at
    logging.info("pdf2text read time " + str(elapsed))
    return page_layout
def _table_page_entries(page_no, page_text, table_connect_list, connect_text_list):
    """Return the ``[text, page_no, 0]`` entries a table page contributes.

    A page that belongs to a cross-page table contributes the merged text
    once (only when it is the first page of that group) and nothing
    otherwise; a page outside every group contributes its own text.
    """
    entries = []
    in_connected_area = False
    for area, connected in zip(table_connect_list, connect_text_list):
        if page_no in area:
            in_connected_area = True
            # Record the merged text only once, on the first page of the group.
            if page_no == area[0]:
                entries.append([connected[0], page_no, 0])
    if not in_connected_area:
        entries.append([page_text, page_no, 0])
    return entries


@get_memory_info.memory_decorator
def pdf2text(path, unique_type_dir):
    """Convert a PDF file to a single text/HTML string.

    Steps, as implemented below:
      1. Render pages to PNG with ``pdf2Image`` and run ``image_preprocess``
         on each image to find pages that contain tables.
      2. Merge tables that continue across pages (``page_table_connect``).
      3. Parse the PDF with pdfminer. Table pages reuse the text from step 1;
         other pages are read object by object, falling back to OCR of the
         rendered page image when text cannot be decoded or an embedded
         image cannot be used.
      If pdfminer raises ``PSEOF`` while opening the file, every page is
      taken from step 1 / OCR instead.

    Args:
        path: path of the PDF file.
        unique_type_dir: directory passed to ``from_office_interface`` when a
            single page is converted through the office interface.

    Returns:
        ``[text]`` on success, otherwise a one-element error-code list:
        ``[-3]`` for a PDF that cannot be parsed, ``[-4]`` when page analysis
        times out, ``[-1]`` for an unexpected exception, or whatever error
        list one of the called helpers returned.
    """
    logging.info("into pdf2text")
    fp = None
    try:
        # Render with PyMuPDF. splitext only splits on the extension, so
        # directories containing dots no longer corrupt the prefix.
        path_root, path_ext = os.path.splitext(path)
        save_dir = path_root + "_" + path_ext.lstrip(".")
        output_image_dict = pdf2Image(path, save_dir)
        if judge_error_code(output_image_dict):
            return output_image_dict
        output_image_dict = output_image_dict[0]
        output_image_no_list = sorted(output_image_dict)

        # Per page: extracted text, table column counts, outline points,
        # whether the page has a table, and the image shape.
        page_info_dict = {}
        has_table_dict = {}
        for page_no in output_image_no_list:
            img_path = output_image_dict.get(page_no)
            print("pdf page", page_no, "in total", output_image_no_list[-1])
            # Skip pages whose image cannot be read.
            try:
                img = cv2.imread(img_path)
                img_size = img.shape
            except Exception:
                logging.info("pdf2text read image in page fail! continue...")
                continue

            text, column_list, outline_points, is_table = image_preprocess(img, img_path,
                                                                           use_ocr=False)
            if judge_error_code(text):
                return text
            page_info = [text, column_list, outline_points, is_table, img_size]
            page_info_dict[int(page_no)] = page_info
            if is_table:
                has_table_dict[int(page_no)] = page_info

        page_no_list = sorted(page_info_dict)

        # Join tables that continue from one page onto the next.
        table_connect_list, connect_text_list = page_table_connect(has_table_dict)
        if judge_error_code(table_connect_list):
            return table_connect_list
        print("pdf2text table_connect_list", table_connect_list)
        print("connect_text_list", connect_text_list)

        # pdfminer path.
        try:
            # The handle must stay open while pages are processed; it is
            # closed in the outer ``finally``.
            fp = open(path, 'rb')
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Probe: check that at least the first page can be created.
            for page in PDFPage.create_pages(doc):
                break
        except pdfminer.psparser.PSEOF as e:
            # pdfminer cannot read this file; use the PyMuPDF renderings and
            # OCR for every page instead.
            logging.info("pdf2text " + str(e) + " use ocr read pdf!")
            text_list = []
            for page_no in page_no_list:
                logging.info("pdf2text ocr page_no " + str(page_no))
                page_info = page_info_dict.get(page_no)
                if page_info[3]:
                    # Table page: reuse the preprocessed (possibly merged) text.
                    text_list.extend(_table_page_entries(
                        page_no, page_info[0], table_connect_list, connect_text_list))
                else:
                    with open(output_image_dict.get(page_no), "rb") as ff:
                        image_stream = ff.read()
                    image_text = from_ocr_interface(image_stream)
                    # Same error propagation as the other OCR call sites.
                    if judge_error_code(image_text):
                        return image_text
                    text_list.append([image_text, page_no, 0])
            text_list.sort(key=lambda z: z[1])
            return ["".join(t[0] for t in text_list)]
        except Exception:
            logging.info("pdf format error!")
            traceback.print_exc()
            return [-3]

        text_list = []
        pages = list(PDFPage.create_pages(doc))
        page_count = len(pages)
        # enumerate keeps page_no in step with the page even when a page is
        # skipped after an error (the manual counter used to fall behind).
        for page_no, page in enumerate(pages):
            logging.info("pdf2text pymupdf page_no " + str(page_no))
            # Same page limit as pdf2Image: first 10 and last 10 pages only.
            if page_count > 20 and 10 <= page_no < page_count - 10:
                continue

            # Table pages: take the text that was already generated.
            if page_no in has_table_dict:
                text_list.extend(_table_page_entries(
                    page_no, has_table_dict.get(page_no)[0],
                    table_connect_list, connect_text_list))
                continue

            # Pages without tables are parsed with pdfminer.
            if get_platform() == "Windows":
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                except Exception:
                    logging.info("pdf2text pdfminer read pdf page error! continue...")
                    continue
            else:
                # pdf_analyze enforces a timeout.
                try:
                    layout = pdf_analyze(interpreter, page, device)
                except TimeoutError:
                    logging.info("pdf2text pdfminer read pdf page time out!")
                    return [-4]
                except Exception:
                    logging.info("pdf2text pdfminer read pdf page error! continue...")
                    continue

            # A page with no text objects may just be a watermark image.
            only_image = 1
            image_count = 0
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    only_image = 0
                if isinstance(x, LTFigure):
                    image_count += 1

            # Too many figures on the page: OCR the whole rendered page.
            logging.info("pdf2text image_count " + str(image_count))
            if image_count >= 3:
                image_text = page_info_dict.get(page_no)[0]
                if image_text is None:
                    with open(output_image_dict.get(page_no), "rb") as ff:
                        image_stream = ff.read()
                    image_text = from_ocr_interface(image_stream)
                    if judge_error_code(image_text):
                        return image_text
                    page_info_dict[page_no][0] = image_text
                text_list.append([image_text, page_no, 0])
                continue

            order_list = []
            for x in layout:
                # Set when the text of this object came from whole-page OCR.
                ocr_flag = 0
                if get_platform() == "Windows":
                    print()

                if isinstance(x, LTTextBoxHorizontal):
                    image_text = x.get_text()
                    # "(cid:N)" in the text means the encoding could not be
                    # resolved; replace the whole page with OCR output.
                    cid_match = re.search('[(]cid:[0-9]+[)]', image_text)
                    if cid_match:
                        print(cid_match)
                        image_text = page_info_dict.get(page_no)[0]
                        if image_text is None:
                            with open(output_image_dict.get(page_no), "rb") as ff:
                                image_stream = ff.read()
                            image_text = from_ocr_interface(image_stream)
                            if judge_error_code(image_text):
                                return image_text
                            page_info_dict[page_no][0] = image_text
                        image_text = add_div(image_text)
                        order_list = [[image_text, page_no, x.bbox[1]]]
                        break
                    else:
                        image_text = add_div(image_text)
                        order_list.append([image_text, page_no, x.bbox[1]])
                        continue

                if isinstance(x, LTFigure):
                    for image in x:
                        if not isinstance(image, LTImage):
                            continue
                        try:
                            print("pdf2text LTImage size", page_no, image.width, image.height)
                            image_stream = image.stream.get_data()
                            # Ignore small images.
                            if image.width <= 300 and image.height <= 300:
                                continue
                            # Inspect the extracted image; if it is too large,
                            # raise so the rendered page is OCR'd instead.
                            img_test = Image.open(io.BytesIO(image_stream))
                            if img_test.size[1] > 2000 or img_test.size[0] > 1500:
                                print("pdf2text LTImage stream output size", img_test.size)
                                raise Exception
                            # Smaller images are saved and OCR'd directly.
                            else:
                                img_test.save('temp/LTImage.jpg')
                                with open('temp/LTImage.jpg', "rb") as ff:
                                    image_stream = ff.read()
                                image_text = from_ocr_interface(image_stream)
                                if judge_error_code(image_text):
                                    return image_text
                        except Exception:
                            logging.info("pdf2text pdfminer read image in page " + str(page_no) +
                                         " fail! use pymupdf read image...")
                            traceback.print_exc()
                            image_text = page_info_dict.get(page_no)[0]
                            if image_text is None:
                                with open(output_image_dict.get(page_no), "rb") as ff:
                                    image_stream = ff.read()
                                image_text = from_ocr_interface(image_stream)
                                if judge_error_code(image_text):
                                    return image_text
                                page_info_dict[page_no][0] = image_text
                            ocr_flag = 1

                        # Only a watermark was captured: no text output and
                        # the page contains nothing but image objects.
                        if image_text == "" and only_image:
                            # Split this page out into its own PDF.
                            try:
                                logging.info("pdf2text guess pdf has watermark")
                                split_path = get_single_pdf(path, page_no)
                            except Exception:
                                # If splitting fails it is most likely not a
                                # watermark page; OCR the rendered page.
                                logging.info("pdf2text guess pdf has no watermark")
                                image_text = page_info_dict.get(page_no)[0]
                                if image_text is None:
                                    with open(output_image_dict.get(page_no), "rb") as ff:
                                        image_stream = ff.read()
                                    image_text = from_ocr_interface(image_stream)
                                    if judge_error_code(image_text):
                                        return image_text
                                order_list.append([image_text, page_no, -1])
                                page_info_dict[page_no][0] = image_text
                                ocr_flag = 1
                                continue
                            if judge_error_code(split_path):
                                return split_path

                            # Convert the single page through the office interface.
                            file_path = from_office_interface(split_path, unique_type_dir, 'html', 3)
                            if judge_error_code(file_path):
                                return file_path

                            # Read the text back from the generated HTML.
                            image_text = get_html_p(file_path)
                            if judge_error_code(image_text):
                                return image_text

                        if get_platform() == "Windows":
                            print("image_text", page_no, x.bbox[1], image_text)
                            with open("temp" + str(x.bbox[0]) + ".jpg", "wb") as ff:
                                ff.write(image_stream)
                        image_text = add_div(image_text)
                        if ocr_flag:
                            order_list.append([image_text, page_no, -1])
                        else:
                            order_list.append([image_text, page_no, x.bbox[1]])

            # Order by the stored y value, descending; OCR entries (-1) sort last.
            order_list.sort(key=lambda z: z[2], reverse=True)

            # OCR took part. The guard on an empty list keeps a page without
            # any text box or figure from raising IndexError.
            if order_list and order_list[-1][2] == -1:
                ocr_order_list = [order_list[-1]]
                not_ocr_order_list = []
                not_ocr_text = ""
                # De-duplicate text that was fetched twice after a read failure.
                for order in order_list:
                    if order[2] != -1:
                        not_ocr_order_list.append(order)
                        not_ocr_text += order[0]
                if string_similarity(ocr_order_list[0][0], not_ocr_text) >= 0.85:
                    order_list = not_ocr_order_list
                else:
                    order_list = ocr_order_list

            text_list.extend(order_list)

        return ["".join(t[0] for t in text_list if t[0] is not None)]
    except UnicodeDecodeError as e:
        logging.info("pdf2text pdfminer create pages failed! " + str(e))
        return [-3]
    except Exception:
        logging.info("pdf2text error!")
        traceback.print_exc()
        return [-1]
    finally:
        # Close the PDF handle opened for pdfminer on every exit path.
        if fp is not None:
            fp.close()
def get_single_pdf(path, page_no):
    """Write one page of a PDF to a new file and return the new path.

    Args:
        path: path of the source PDF.
        page_no: 0-based index passed to ``PdfFileReader.getPage``.

    Returns:
        The path of the written file: ``path`` without its extension plus
        ``"_split.pdf"``.

    Raises:
        PyPDF2.utils.PdfReadError: re-raised unchanged.
        Exception: any other error is logged and re-raised.
    """
    logging.info("into get_single_pdf")
    try:
        pdf_origin = PdfFileReader(path, strict=False)
        pdf_new = PdfFileWriter()
        pdf_new.addPage(pdf_origin.getPage(page_no))

        # splitext strips only the extension; splitting on the first "."
        # turned "./a.pdf" into "_split.pdf" and truncated dotted directories.
        path_new = os.path.splitext(path)[0] + "_split.pdf"
        with open(path_new, "wb") as ff:
            pdf_new.write(ff)
        return path_new
    except PyPDF2.utils.PdfReadError:
        # Propagate read errors untouched (bare raise keeps the traceback).
        raise
    except Exception:
        logging.info("get_single_pdf error! page " + str(page_no))
        traceback.print_exc()
        raise
def page_table_connect(has_table_dict, threshold=7):
    """Find tables that continue across consecutive pages and merge their text.

    Each value of ``has_table_dict`` is indexed as
    ``[text, column_list, outline_points, is_table, img_size]``.
    Two pages are connected when all of the following hold:
      * their page numbers differ by exactly 1;
      * the last column entry of the earlier page and the first column entry
        of the later page are both falsy and equal;
      * the last outline of the earlier page ends within
        ``int(height / threshold)`` of the bottom, and the first outline of
        the later page starts within ``int(height / threshold)`` of the top,
        where ``height`` is ``img_size[0]``.

    Args:
        has_table_dict: dict mapping page number to the page-info list above.
        threshold: divisor of the page height that gives the allowed distance
            from the page edge (default 7, i.e. one seventh of the height).

    Returns:
        ``(table_connect_list, connect_text_list)``. ``table_connect_list`` is
        a list of sorted page-number groups; ``connect_text_list`` holds one
        ``[merged_text, group]`` per group, where the closing ``</table>`` of
        the accumulated text and the opening ``<table border="1">`` of each
        following page are removed before concatenation.
        ``([], [])`` for empty input, ``([-1], [-1])`` on any error.
    """
    logging.info("into page_table_connect")
    if not has_table_dict:
        return [], []
    table_open = '<table border="1">'
    table_close = '</table>'
    try:
        page_no_list = sorted(has_table_dict)

        # Collect runs of consecutive pages whose tables touch.
        table_connect_list = []
        temp_list = []
        for prev_no, curr_no in zip(page_no_list, page_no_list[1:]):
            last_page_info = has_table_dict.get(prev_no)
            page_info = has_table_dict.get(curr_no)
            connected = (
                # Page numbers must be adjacent.
                curr_no - prev_no == 1
                # Column entries at the page boundary are both falsy and equal.
                and not last_page_info[1][-1]
                and not page_info[1][0]
                and last_page_info[1][-1] == page_info[1][0]
                # Earlier table ends near the bottom, later one starts near the top.
                and last_page_info[4][0] - last_page_info[2][-1][1][1]
                <= int(last_page_info[4][0] / threshold)
                and page_info[2][0][0][1] - 0
                <= int(page_info[4][0] / threshold)
            )
            if connected:
                temp_list.append(prev_no)
                temp_list.append(curr_no)
                continue
            # The run is broken: store the pages collected so far.
            if len(temp_list) > 1:
                table_connect_list.append(sorted(set(temp_list)))
                temp_list = []
        if len(temp_list) > 1:
            table_connect_list.append(sorted(set(temp_list)))

        # Merge the text of every group into a single table.
        connect_text_list = []
        for area in table_connect_list:
            area_page_text = str(has_table_dict.get(area[0])[0])
            for current_page_no in area[1:]:
                current_page_text = str(has_table_dict.get(current_page_no)[0])

                # Drop the first opening tag of the following page.
                start = current_page_text.find(table_open)
                if start < 0:
                    raise ValueError("opening table tag not found on page "
                                     + str(current_page_no))
                current_page_text = current_page_text[:start] \
                    + current_page_text[start + len(table_open):]

                # Drop the last closing tag of the text accumulated so far.
                end = area_page_text.rfind(table_close)
                if end < 0:
                    raise ValueError("closing table tag not found before page "
                                     + str(current_page_no))
                area_page_text = area_page_text[:end] \
                    + area_page_text[end + len(table_close):]

                area_page_text = area_page_text + current_page_text
            connect_text_list.append([area_page_text, area])
        return table_connect_list, connect_text_list
    except Exception:
        logging.info("page_table_connect error!")
        # print_exc() prints the traceback itself and returns None.
        traceback.print_exc()
        return [-1], [-1]
|