|
@@ -1,3 +1,4 @@
|
|
|
|
+import copy
|
|
import inspect
|
|
import inspect
|
|
import io
|
|
import io
|
|
import logging
|
|
import logging
|
|
@@ -29,6 +30,7 @@ from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LT
|
|
from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
|
|
from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
|
|
get_logger, log, memory_decorator
|
|
get_logger, log, memory_decorator
|
|
import fitz
|
|
import fitz
|
|
|
|
+from format_convert.wrapt_timeout_decorator import timeout
|
|
|
|
|
|
|
|
|
|
@memory_decorator
|
|
@memory_decorator
|
|
@@ -94,7 +96,7 @@ def pdf2Image(path, save_dir):
|
|
return [-1]
|
|
return [-1]
|
|
|
|
|
|
|
|
|
|
-@timeout_decorator.timeout(10, timeout_exception=TimeoutError)
|
|
|
|
|
|
+@timeout(10, timeout_exception=TimeoutError)
|
|
def pdf_analyze(interpreter, page, device, page_no):
|
|
def pdf_analyze(interpreter, page, device, page_no):
|
|
log("into pdf_analyze")
|
|
log("into pdf_analyze")
|
|
pdf_time = time.time()
|
|
pdf_time = time.time()
|
|
@@ -580,6 +582,73 @@ def page_table_connect(has_table_dict):
|
|
return [-1], [-1]
|
|
return [-1], [-1]
|
|
|
|
|
|
|
|
|
|
|
|
@timeout(30, timeout_exception=TimeoutError)
def read_pdf(path, package_name, packages):
    """Open the PDF at ``path`` with the backend selected by ``package_name``.

    ``package_name`` is compared against the entries of ``packages`` in
    order, and the first match decides what is built and returned:

    * ``packages[0]`` -> ``(PDFDocument, PDFPageAggregator, PDFPageInterpreter)``
    * ``packages[1]`` -> the document returned by ``fitz.open(path)``
    * ``packages[2]`` -> ``(PdfFileReader(path, strict=False), PdfFileWriter())``
    * ``packages[3]`` -> ``(LineTable(), 0, PDF(fp, laparams=...))``

    :param path: path of the PDF file to open.
    :param package_name: name of the backend to initialise.
    :param packages: indexable collection holding at least the backend
        names that need to be compared.
    :return: the backend specific objects listed above, or ``None`` when
        ``package_name`` matches none of the first four entries.
    """
    log(package_name)
    laparams = LAParams(
        line_overlap=0.01,
        char_margin=0.3,
        line_margin=0.01,
        word_margin=0.01,
        boxes_flow=0.1,
    )

    if package_name == packages[0]:
        fp = open(path, 'rb')
        try:
            parser = PDFParser(fp)
            doc_pdfminer = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
        except BaseException:
            # The handle is only useful while the document is alive; do not
            # leak it when construction fails.
            fp.close()
            raise
        return doc_pdfminer, device, interpreter

    if package_name == packages[1]:
        return fitz.open(path)

    if package_name == packages[2]:
        doc_pypdf2 = PdfFileReader(path, strict=False)
        doc_pypdf2_new = PdfFileWriter()
        return doc_pypdf2, doc_pypdf2_new

    if package_name == packages[3]:
        fp = open(path, 'rb')
        try:
            lt = LineTable()
            doc_top = 0
            # Build the PDF object directly. read_pdfplumber() expects a
            # path (it calls open() itself) and returns a 3-tuple, so handing
            # it the open file object could never produce the document.
            doc_pdfplumber = PDF(fp, laparams=laparams.__dict__)
        except BaseException:
            fp.close()
            raise
        return lt, doc_top, doc_pdfplumber

    # Unknown backend name: same implicit result as before, made explicit.
    return None
|
|
|
|
+
|
|
|
|
+
|
|
|
|
@timeout(25, timeout_exception=TimeoutError)
def read_pdfminer(path, laparams):
    """Create the pdfminer objects needed to analyse the PDF at ``path``.

    :param path: path of the PDF file to open.
    :param laparams: layout parameters forwarded to ``PDFPageAggregator``.
    :return: tuple ``(doc_pdfminer, device, interpreter)`` holding the
        ``PDFDocument``, the ``PDFPageAggregator`` and the
        ``PDFPageInterpreter`` built on a shared ``PDFResourceManager``.
    :raises: whatever ``open`` or the pdfminer constructors raise; the file
        handle is closed before the exception propagates.
    """
    fp = open(path, 'rb')
    try:
        parser = PDFParser(fp)
        doc_pdfminer = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
    except BaseException:
        # On success the handle stays open because the parser was built on
        # it; on failure nobody else holds a reference, so release it here.
        fp.close()
        raise
    return doc_pdfminer, device, interpreter
|
|
|
|
+
|
|
|
|
+
|
|
|
|
@timeout(15, timeout_exception=TimeoutError)
def read_pymupdf(path):
    """Open ``path`` with ``fitz.open`` and return the resulting document."""
    doc_pymupdf = fitz.open(path)
    return doc_pymupdf
|
|
|
|
+
|
|
|
|
+
|
|
|
|
@timeout(15, timeout_exception=TimeoutError)
def read_pypdf2(path):
    """Return a non-strict ``PdfFileReader`` for ``path`` and a new ``PdfFileWriter``.

    :param path: path of the PDF file handed to ``PdfFileReader``.
    :return: tuple ``(reader, writer)``; the reader is created first.
    """
    reader = PdfFileReader(path, strict=False)
    return reader, PdfFileWriter()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
@timeout(25, timeout_exception=TimeoutError, use_signals=False)
def read_pdfplumber(path, laparams):
    """Open ``path`` and wrap it in a ``PDF`` object together with its helpers.

    :param path: path of the PDF file; it is opened here in binary mode.
    :param laparams: layout parameter object; its ``__dict__`` is passed to
        ``PDF`` as the ``laparams`` keyword argument.
    :return: tuple ``(lt, doc_top, doc_pdfplumber)`` where ``lt`` is a new
        ``LineTable``, ``doc_top`` is ``0`` and ``doc_pdfplumber`` is the
        ``PDF`` instance built on the opened file.
    :raises: whatever ``open``, ``LineTable`` or ``PDF`` raise; the file
        handle is closed before the exception propagates.

    NOTE(review): the decorator is used with ``use_signals=False`` —
    presumably the call then runs outside the calling thread/process, so
    confirm the returned objects survive being handed back.
    """
    fp = open(path, 'rb')
    try:
        lt = LineTable()
        doc_top = 0
        doc_pdfplumber = PDF(fp, laparams=laparams.__dict__)
    except BaseException:
        # The PDF object needs the handle only when it was actually created.
        fp.close()
        raise
    return lt, doc_top, doc_pdfplumber
|
|
|
|
+
|
|
|
|
+
|
|
class PDFConvert:
|
|
class PDFConvert:
|
|
def __init__(self, path, unique_type_dir):
|
|
def __init__(self, path, unique_type_dir):
|
|
self._doc = _Document(path)
|
|
self._doc = _Document(path)
|
|
@@ -595,40 +664,49 @@ class PDFConvert:
|
|
def init_package(self, package_name):
    """Initialise the PDF backend called ``package_name`` for ``self.path``.

    The name is compared with ``self.packages[0]`` .. ``self.packages[3]``
    and the matching module-level reader (``read_pdfminer``,
    ``read_pymupdf``, ``read_pypdf2`` or ``read_pdfplumber``) is called; its
    result is stored on the instance. Any exception, including the one
    raised for an unsupported name, is logged and recorded by setting
    ``self._doc.error_code`` to ``[-3]``; nothing is re-raised.
    """
    # Initialise the requested package.
    try:
        # Layout settings handed to the pdfminer and pdfplumber readers.
        laparams = LAParams(
            line_overlap=0.01,
            char_margin=0.3,
            line_margin=0.01,
            word_margin=0.01,
            boxes_flow=0.1,
        )

        if package_name == self.packages[0]:
            pdfminer_objects = read_pdfminer(self.path, laparams)
            self.doc_pdfminer, self.device, self.interpreter = pdfminer_objects
            self.has_init_pdf[0] = 1

        elif package_name == self.packages[1]:
            self.doc_pymupdf = read_pymupdf(self.path)
            self.has_init_pdf[1] = 1

        elif package_name == self.packages[2]:
            pypdf2_objects = read_pypdf2(self.path)
            self.doc_pypdf2, self.doc_pypdf2_new = pypdf2_objects
            self.has_init_pdf[2] = 1

        elif package_name == self.packages[3]:
            pdfplumber_objects = read_pdfplumber(self.path, laparams)
            self.lt, self.doc_top, self.doc_pdfplumber = pdfplumber_objects
            # NOTE(review): the other branches set their flag to 1 while this
            # one writes 0 — confirm whether re-initialising pdfplumber on
            # every check of has_init_pdf[3] is intended.
            self.has_init_pdf[3] = 0

        else:
            print("Only Support Packages", str(self.packages))
            raise Exception
    except Exception:
        log(package_name + " cannot open pdf!")
        traceback.print_exc()
        self._doc.error_code = [-3]
|
|
|
|
|
|
def convert(self):
|
|
def convert(self):
|
|
@@ -720,7 +798,7 @@ class PDFConvert:
|
|
# image_count += 1
|
|
# image_count += 1
|
|
lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
|
|
lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
|
|
print("convert_pdf page", page_no)
|
|
print("convert_pdf page", page_no)
|
|
- print("len(lt_image_list), len(lt_text_list)", len(lt_image_list), len(lt_text_list))
|
|
|
|
|
|
+ log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
|
|
|
|
|
|
# 若只有文本且图片数为0,直接提取文字及表格
|
|
# 若只有文本且图片数为0,直接提取文字及表格
|
|
# if only_image == 0 and image_count == 0:
|
|
# if only_image == 0 and image_count == 0:
|
|
@@ -729,6 +807,15 @@ class PDFConvert:
|
|
if self.has_init_pdf[3] == 0:
|
|
if self.has_init_pdf[3] == 0:
|
|
self.init_package("pdfplumber")
|
|
self.init_package("pdfplumber")
|
|
if self._doc.error_code is not None:
|
|
if self._doc.error_code is not None:
|
|
|
|
+ self._doc.error_code = None
|
|
|
|
+ log("init pdfplumber failed! try pymupdf...")
|
|
|
|
+ # 调用pdfplumber获取pdf图片报错,则使用pypdf2将pdf转html
|
|
|
|
+ page_image = self.get_page_image(page_no)
|
|
|
|
+ if judge_error_code(page_image):
|
|
|
|
+ self._page.error_code = page_image
|
|
|
|
+ else:
|
|
|
|
+ _image = _Image(page_image[1], page_image[0])
|
|
|
|
+ self._page.add_child(_image)
|
|
return
|
|
return
|
|
|
|
|
|
# 无法识别pdf字符编码,整页用ocr
|
|
# 无法识别pdf字符编码,整页用ocr
|
|
@@ -737,6 +824,7 @@ class PDFConvert:
|
|
text_temp += _t.get_text()
|
|
text_temp += _t.get_text()
|
|
|
|
|
|
if re.search('[(]cid:[0-9]+[)]', text_temp):
|
|
if re.search('[(]cid:[0-9]+[)]', text_temp):
|
|
|
|
+ log("text has cid! try pymupdf...")
|
|
page_image = self.get_page_image(page_no)
|
|
page_image = self.get_page_image(page_no)
|
|
if judge_error_code(page_image):
|
|
if judge_error_code(page_image):
|
|
self._page.error_code = page_image
|
|
self._page.error_code = page_image
|
|
@@ -838,12 +926,13 @@ class PDFConvert:
|
|
self._page.add_child(_image)
|
|
self._page.add_child(_image)
|
|
except Exception:
|
|
except Exception:
|
|
log("pdf2text pdfminer read image in page " + str(page_no) +
|
|
log("pdf2text pdfminer read image in page " + str(page_no) +
|
|
- " fail! use pymupdf read image...")
|
|
|
|
|
|
+ " fail! use pymupdf read image...")
|
|
print(traceback.print_exc())
|
|
print(traceback.print_exc())
|
|
# pdf对象需反向排序
|
|
# pdf对象需反向排序
|
|
self._page.is_reverse = True
|
|
self._page.is_reverse = True
|
|
|
|
|
|
def get_layout(self, page, page_no):
|
|
def get_layout(self, page, page_no):
|
|
|
|
+ log("")
|
|
if self.has_init_pdf[0] == 0:
|
|
if self.has_init_pdf[0] == 0:
|
|
self.init_package("pdfminer")
|
|
self.init_package("pdfminer")
|
|
if self._doc.error_code is not None:
|
|
if self._doc.error_code is not None:
|
|
@@ -868,6 +957,7 @@ class PDFConvert:
|
|
return layout
|
|
return layout
|
|
|
|
|
|
def get_page_image(self, page_no):
|
|
def get_page_image(self, page_no):
|
|
|
|
+ log("")
|
|
try:
|
|
try:
|
|
if self.has_init_pdf[1] == 0:
|
|
if self.has_init_pdf[1] == 0:
|
|
self.init_package("PyMuPDF")
|
|
self.init_package("PyMuPDF")
|
|
@@ -905,6 +995,7 @@ class PDFConvert:
|
|
return [-3]
|
|
return [-3]
|
|
|
|
|
|
def get_all_page_image(self):
|
|
def get_all_page_image(self):
|
|
|
|
+ log("")
|
|
if self.has_init_pdf[1] == 0:
|
|
if self.has_init_pdf[1] == 0:
|
|
self.init_package("PyMuPDF")
|
|
self.init_package("PyMuPDF")
|
|
if self._doc.error_code is not None:
|
|
if self._doc.error_code is not None:
|
|
@@ -976,6 +1067,23 @@ class PDFConvert:
|
|
_img = cv2.resize(_img, (new_shape[1], new_shape[0]))
|
|
_img = cv2.resize(_img, (new_shape[1], new_shape[0]))
|
|
cv2.imwrite(img_path, _img)
|
|
cv2.imwrite(img_path, _img)
|
|
|
|
|
|
|
|
def get_single_pdf(self, path, page_no):
    """Write page ``page_no`` of the loaded PyPDF2 document to its own file.

    Works on deep copies of ``self.doc_pypdf2`` and ``self.doc_pypdf2_new``
    so the instance's reader and writer are left untouched.

    :param path: path of the source PDF; the output file is placed next to
        it, named ``<path without extension>_split.pdf``.
    :param page_no: index of the page passed to ``getPage``.
    :return: the path of the written single-page PDF, or ``[-3]`` when
        anything goes wrong.
    """
    import os

    log("into get_single_pdf")
    try:
        pdf_origin = copy.deepcopy(self.doc_pypdf2)
        pdf_new = copy.deepcopy(self.doc_pypdf2_new)
        pdf_new.addPage(pdf_origin.getPage(page_no))

        # splitext strips only the final extension, so dots in directory
        # names or in the file stem ("./tmp/a.b.pdf") no longer truncate
        # the output path.
        path_new = os.path.splitext(path)[0] + "_split.pdf"
        with open(path_new, "wb") as ff:
            pdf_new.write(ff)
        return path_new
    except Exception:
        # Read errors and every other failure yield the same error code, so
        # one handler is enough and no PyPDF2 module name has to be in scope.
        log("get_single_pdf error! page " + str(page_no))
        return [-3]
|
|
|
|
+
|
|
|
|
|
|
# 以下为现成pdf单页解析接口
|
|
# 以下为现成pdf单页解析接口
|
|
class ParseSentence:
|
|
class ParseSentence:
|