|
@@ -772,7 +772,54 @@ class PDFConvert:
|
|
|
return re.sub("\s","",_text)
|
|
|
|
|
|
|
|
|
-
|
|
|
def get_text_lines(self, page, page_no):
    """Collect the edges found on one PDF page as ``LTLine`` objects.

    The page is wrapped in a ``pdfPage`` built from ``self.doc_pdfplumber``,
    using the current ``self.doc_top`` as ``initial_doctop``. Afterwards
    ``self.doc_top`` is advanced by that page's height. Every edge returned
    by ``TableFinder.get_edges()`` is converted into an ``LTLine``.

    :param page: page object handed to ``pdfPage``.
    :param page_no: page number; passed as ``page_number`` and written to the log.
    :return: list of ``LTLine`` objects, one per edge.
    """
    plumber_page = pdfPage(
        self.doc_pdfplumber,
        page,
        page_number=page_no,
        initial_doctop=self.doc_top,
    )
    # Accumulate the offset that is passed as initial_doctop on the next call.
    self.doc_top += plumber_page.height

    edges = TableFinder(plumber_page).get_edges()
    lines = [
        LTLine(
            1,
            (float(edge["x0"]), float(edge["y0"])),
            (float(edge["x1"]), float(edge["y1"])),
        )
        for edge in edges
    ]

    log("pdf page %s has %s lines" % (str(page_no), str(len(lines))))
    return lines
|
|
|
+
|
|
|
def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
    """Attach the tables and sentences recognized on a page to ``self._page``.

    Steps, in order:
      1. ``self.lt.recognize_table`` is run on the text and line objects; the
         returned ``filter_objs`` are stored on ``self._page.in_table_objs``.
      2. Each recognized table is added to the page as a ``_Table``.
      3. ``ParseUtils.recognize_sentences`` is run on the text objects and
         ``filter_objs``; each result is added to the page as a ``_Sentence``.
      4. ``self._page.is_reverse`` is set to ``True``.

    :param layout: page layout; only its ``bbox`` is read here.
    :param page_no: page number forwarded to ``ParseUtils.recognize_sentences``.
    :param lt_text_list: text objects of the page.
    :param lt_line_list: line objects of the page (see ``get_text_lines``).
    :return: None. Results are written to ``self._page``.
    """
    list_tables, filter_objs, _ = self.lt.recognize_table(lt_text_list, lt_line_list)
    self._page.in_table_objs = filter_objs

    # Report through the project logger instead of a bare debug print.
    log("=======text_len:%d:filter_len:%d" % (len(lt_text_list), len(filter_objs)))

    for table in list_tables:
        self._page.add_child(_Table(table["table"], table["bbox"]))

    list_sentences = ParseUtils.recognize_sentences(
        lt_text_list, filter_objs, layout.bbox, page_no
    )
    for sentence in list_sentences:
        self._page.add_child(_Sentence(sentence.text, sentence.bbox))

    # PDF objects need to be sorted in reverse order.
    self._page.is_reverse = True
|
|
|
+
|
|
|
def is_text_legal(self, lt_text_list, page_no):
    """Check whether the page's extracted text is usable.

    If the concatenated text contains a ``(cid:N)`` marker, the PDF character
    encoding could not be recognized and the whole page is handled as an
    image instead. In that case the page image is fetched with
    ``self.get_page_image``. An error result is stored in
    ``self._page.error_code``; otherwise the image is added to the page as an
    ``_Image`` child.

    :param lt_text_list: iterable of text objects exposing ``get_text()``.
    :param page_no: page number passed to ``self.get_page_image``.
    :return: ``True`` when no ``(cid:N)`` marker is present, ``False`` when
        the page was handled through the image fallback.
    """
    # Join once instead of repeated ``+=``. The search still runs on the
    # full concatenation so a marker spanning two text objects is found.
    page_text = "".join(_t.get_text() for _t in lt_text_list)

    if not re.search('[(]cid:[0-9]+[)]', page_text):
        return True

    # PDF character encoding cannot be recognized: use OCR for the whole page.
    log("text has cid! try pymupdf...")
    page_image = self.get_page_image(page_no)
    if judge_error_code(page_image):
        self._page.error_code = page_image
    else:
        self._page.add_child(_Image(page_image[1], page_image[0]))
    return False
|
|
|
|
|
|
def convert_page(self, page, page_no):
|
|
|
# pdf page.annots为None,不经过get_layout,直接ocr
|
|
@@ -784,6 +831,8 @@ class PDFConvert:
|
|
|
# image_count = 1
|
|
|
# else:
|
|
|
layout = self.get_layout(page, page_no)
|
|
|
+ self.width = layout.width
|
|
|
+ self.height = layout.height
|
|
|
if self._doc.error_code is not None:
|
|
|
return
|
|
|
if judge_error_code(layout):
|
|
@@ -826,51 +875,14 @@ class PDFConvert:
|
|
|
self._page.add_child(_image)
|
|
|
return
|
|
|
|
|
|
- # 无法识别pdf字符编码,整页用ocr
|
|
|
- text_temp = ""
|
|
|
- for _t in lt_text_list:
|
|
|
- text_temp += _t.get_text()
|
|
|
|
|
|
- if re.search('[(]cid:[0-9]+[)]', text_temp):
|
|
|
- log("text has cid! try pymupdf...")
|
|
|
- page_image = self.get_page_image(page_no)
|
|
|
- if judge_error_code(page_image):
|
|
|
- self._page.error_code = page_image
|
|
|
- else:
|
|
|
- _image = _Image(page_image[1], page_image[0])
|
|
|
- self._page.add_child(_image)
|
|
|
+ if not self.is_text_legal(lt_text_list,page_no):
|
|
|
return
|
|
|
|
|
|
try:
|
|
|
- lt_line_list = []
|
|
|
- page_plumber = pdfPage(self.doc_pdfplumber, page, page_number=page_no, initial_doctop=self.doc_top)
|
|
|
- self.doc_top += page_plumber.height
|
|
|
-
|
|
|
- table_finder = TableFinder(page_plumber)
|
|
|
- for _edge in table_finder.get_edges():
|
|
|
- lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
|
|
|
- (float(_edge["x1"]), float(_edge["y1"]))))
|
|
|
-
|
|
|
-
|
|
|
- #draw lines to check
|
|
|
- # draw_lines_plt([l.bbox for l in lt_line_list])
|
|
|
-
|
|
|
- list_tables, filter_objs, _ = self.lt.recognize_table(lt_text_list, lt_line_list)
|
|
|
- self._page.in_table_objs = filter_objs
|
|
|
-
|
|
|
- for table in list_tables:
|
|
|
- _table = _Table(table["table"], table["bbox"])
|
|
|
- # self._page.children.append(_table)
|
|
|
- self._page.add_child(_table)
|
|
|
-
|
|
|
- list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
|
|
|
- layout.bbox, page_no)
|
|
|
+ lt_line_list = self.get_text_lines(page,page_no)
|
|
|
+ self.recognize_text(layout,page_no,lt_text_list,lt_line_list)
|
|
|
|
|
|
- for sentence in list_sentences:
|
|
|
- _sen = _Sentence(sentence.text, sentence.bbox)
|
|
|
- self._page.add_child(_sen)
|
|
|
- # pdf对象需反向排序
|
|
|
- self._page.is_reverse = True
|
|
|
except:
|
|
|
traceback.print_exc()
|
|
|
self._page.error_code = [-8]
|
|
@@ -888,25 +900,6 @@ class PDFConvert:
|
|
|
|
|
|
# 正常读取该页对象
|
|
|
else:
|
|
|
- # 文本对象
|
|
|
- for x in lt_text_list:
|
|
|
- # 获取对象文本
|
|
|
- object_text = x.get_text()
|
|
|
-
|
|
|
- # 无法识别pdf字符编码,整页用ocr
|
|
|
- if re.search('[(]cid:[0-9]+[)]', object_text):
|
|
|
- page_image = self.get_page_image(page_no)
|
|
|
- if judge_error_code(page_image):
|
|
|
- self._page.error_code = page_image
|
|
|
- else:
|
|
|
- _image = _Image(page_image[1], page_image[0])
|
|
|
- self._page.add_child(_image)
|
|
|
- return
|
|
|
- else:
|
|
|
- _sen = _Sentence(object_text, x.bbox)
|
|
|
- # _sen.x = x.bbox[0]
|
|
|
- # _sen.y = x.bbox[1]
|
|
|
- self._page.add_child(_sen)
|
|
|
|
|
|
# 图表对象
|
|
|
for image in lt_image_list:
|
|
@@ -918,7 +911,8 @@ class PDFConvert:
|
|
|
continue
|
|
|
# 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
|
|
|
img_test = Image.open(io.BytesIO(image_stream))
|
|
|
- if img_test.size[1] > 2000 or img_test.size[0] > 1500:
|
|
|
+ # img_test.show()
|
|
|
+ if image.height >= self.height-100 and image.width >= self.width-100:
|
|
|
print("pdf2text LTImage stream output size", img_test.size)
|
|
|
page_image = self.get_page_image(page_no)
|
|
|
if judge_error_code(page_image):
|
|
@@ -943,6 +937,13 @@ class PDFConvert:
|
|
|
print(traceback.print_exc())
|
|
|
# pdf对象需反向排序
|
|
|
self._page.is_reverse = True
|
|
|
+ self.init_package("pdfplumber")
|
|
|
+
|
|
|
+ if not self.is_text_legal(lt_text_list,page_no):
|
|
|
+ return
|
|
|
+
|
|
|
+ lt_line_list = self.get_text_lines(page,page_no)
|
|
|
+ self.recognize_text(layout,page_no,lt_text_list,lt_line_list)
|
|
|
|
|
|
def get_layout(self, page, page_no):
|
|
|
log("")
|