|
@@ -35,7 +35,7 @@ from pdfminer.converter import PDFPageAggregator
|
|
|
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
|
|
|
LTTextBoxVertical, LTLine, LTTextContainer
|
|
|
from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
|
|
|
- get_logger, log, memory_decorator, draw_lines_plt, get_garble_code, line_is_cross
|
|
|
+ get_logger, log, memory_decorator, draw_lines_plt, get_garble_code, line_is_cross, get_md5_from_bytes, bytes2np
|
|
|
import fitz
|
|
|
from format_convert.wrapt_timeout_decorator import timeout
|
|
|
|
|
@@ -689,6 +689,9 @@ class PDFConvert:
|
|
|
self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
|
|
|
self.has_init_pdf = [0] * len(self.packages)
|
|
|
|
|
|
+ # 记录图片对象的md5,用于去除大量重复图片
|
|
|
+ self.md5_image_obj_list = []
|
|
|
+
|
|
|
@memory_decorator
|
|
|
def init_package(self, package_name):
|
|
|
# 各个包初始化
|
|
@@ -800,6 +803,41 @@ class PDFConvert:
|
|
|
self._doc.add_child(self._page)
|
|
|
page_no += 1
|
|
|
|
|
|
+ self.delete_same_image()
|
|
|
+
|
|
|
def delete_same_image(self, show=0):
    """Remove images that are repeated many times across the document.

    The ``[md5, image_obj]`` pairs recorded in ``self.md5_image_obj_list``
    are grouped by md5. Every group holding at least ``cnt_threshold``
    objects is treated as a mass duplicate, and all of its objects are
    removed from the ``children`` of each page in ``self._doc.children``.

    :param show: debug switch. When truthy, the size of every md5 group is
        printed, the first image of each deleted group is displayed, and
        afterwards every remaining ``_Image`` is displayed. The OpenCV
        windows block on ``cv2.waitKey(0)``.
    :return: None; page children lists are modified in place.
    """
    # Group the recorded image objects by digest.
    md5_dict = {}
    for _md5, image_obj in self.md5_image_obj_list:
        md5_dict.setdefault(_md5, []).append(image_obj)

    cnt_threshold = 10
    delete_obj_list = []
    for _md5, img_list in md5_dict.items():
        # Debug output only: keep stdout clean during normal conversion.
        if show:
            print('len(md5_dict.get(_md5))', _md5, len(img_list))
        if len(img_list) < cnt_threshold:
            continue
        if show:
            img_np = bytes2np(img_list[0].content)
            cv2.namedWindow('delete same img_np', cv2.WINDOW_NORMAL)
            cv2.imshow('delete same img_np', img_np)
            cv2.waitKey(0)
        delete_obj_list += img_list

    if delete_obj_list:
        # The recorded objects are the very instances that were added to the
        # pages, so match by identity and filter each page in a single pass.
        # NOTE(review): the previous code matched with ``in``/``remove``
        # (equality); confirm _Image does not define a custom __eq__.
        delete_ids = {id(obj) for obj in delete_obj_list}
        for page in self._doc.children:
            # Slice assignment keeps the same list object alive for any
            # other holder of ``page.children``.
            page.children[:] = [child for child in page.children
                                if id(child) not in delete_ids]

    if show:
        for page in self._doc.children:
            for obj in page.children:
                if isinstance(obj, _Image):
                    img_np = bytes2np(obj.content)
                    cv2.imshow('page img_np', img_np)
                    cv2.waitKey(0)
|
|
|
+
|
|
|
def clean_text(self, _text):
    """Return ``_text`` with every whitespace character removed.

    :param _text: the string to clean.
    :return: ``_text`` without any character matched by the regex ``\\s``.
    """
    # Raw string literal: "\s" in a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r"\s", "", _text)
|
|
|
|
|
@@ -1116,6 +1154,32 @@ class PDFConvert:
|
|
|
_line_list.remove(col)
|
|
|
return _line_list
|
|
|
|
|
|
def cross_line_process(_cross_line_list, _bias_line_list):
    """Run the post-processing pipeline on candidate table lines.

    The steps run in a fixed order: slanted-line correction, column-line
    repair with the bias lines, intersection filtering, merging, removal of
    the outermost nested border, and finally the otr line completion.
    ``page_w`` and ``page_h`` are read from the enclosing scope.

    :param _cross_line_list: candidate lines to process.
    :param _bias_line_list: bias lines used to repair column lines.
    :return: the processed line list, or ``[]`` when the intersection
        filter leaves nothing.
    """
    lines = _cross_line_list

    # Slanted-line correction (only when there is something to correct).
    if lines:
        lines = repair_bias_line(lines)

    # Repair vertical lines with the help of the bias lines.
    if _bias_line_list:
        lines = repair_col_line(lines, _bias_line_list)

    # Keep table lines according to whether they have intersections.
    lines = get_cross_line(lines, threshold=1, cross_times=1)
    if not lines:
        return []

    # Merge lines, then drop the outermost nested border.
    lines = remove_outline_no_cross(merge_line(lines))

    # Reuse part of otr's post-processing to fill in missing lines.
    from otr.table_line_new import table_line_pdf
    return table_line_pdf(lines, page_w, page_h)
|
|
|
+
|
|
|
log('into get_page_lines')
|
|
|
|
|
|
page_h = layout.height
|
|
@@ -1142,12 +1206,18 @@ class PDFConvert:
|
|
|
continue
|
|
|
line_list.append(element.bbox)
|
|
|
|
|
|
+ if show:
|
|
|
+ print('get_page_lines line_list', line_list)
|
|
|
+ print('get_page_lines bias_line_list', bias_line_list)
|
|
|
+ _plot(line_list+bias_line_list, mode=2)
|
|
|
if not line_list and not bias_line_list:
|
|
|
return []
|
|
|
|
|
|
# 是否使用斜线来生成表格
|
|
|
+ line_list_copy = copy.deepcopy(line_list)
|
|
|
if len(line_list) < 6 and len(bias_line_list) > len(line_list) * 2:
|
|
|
- # print('use bias line')
|
|
|
+ if show:
|
|
|
+ print('use bias line')
|
|
|
# bias_line_list += add_col_bias_line(line_list, bias_line_list)
|
|
|
line_list = bias_line_list
|
|
|
|
|
@@ -1156,34 +1226,26 @@ class PDFConvert:
|
|
|
line_list = list(set(line_list))
|
|
|
line_list = [eval(x) for x in line_list]
|
|
|
|
|
|
- # 根据是否有交点判断表格线
|
|
|
- cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
|
|
|
-
|
|
|
- if not cross_line_list:
|
|
|
- return []
|
|
|
-
|
|
|
- # 斜线校正
|
|
|
- if cross_line_list:
|
|
|
- cross_line_list = repair_bias_line(cross_line_list)
|
|
|
-
|
|
|
- # 修复竖线
|
|
|
- if bias_line_list:
|
|
|
- cross_line_list = repair_col_line(cross_line_list, bias_line_list)
|
|
|
+ if show:
|
|
|
+ _plot(line_list, mode=2)
|
|
|
|
|
|
# 根据是否有交点判断表格线
|
|
|
- cross_line_list = get_cross_line(cross_line_list, threshold=1, cross_times=1)
|
|
|
+ cross_line_list = get_cross_line(line_list_copy+bias_line_list, threshold=2, cross_times=1)
|
|
|
|
|
|
- # 合并线条
|
|
|
+ if show:
|
|
|
+ print('get_page_lines cross_line_list', cross_line_list)
|
|
|
if not cross_line_list:
|
|
|
- return []
|
|
|
- cross_line_list = merge_line(cross_line_list)
|
|
|
-
|
|
|
- # 删除最外层嵌套边框
|
|
|
- cross_line_list = remove_outline_no_cross(cross_line_list)
|
|
|
+ # 将线全部合并再获取一次
|
|
|
+ cross_line_list = get_cross_line(line_list_copy+bias_line_list, threshold=2, cross_times=1)
|
|
|
+ if not cross_line_list:
|
|
|
+ return []
|
|
|
|
|
|
- # 复用otr的部分后处理,补线
|
|
|
- from otr.table_line_new import table_line_pdf
|
|
|
- cross_line_list = table_line_pdf(cross_line_list, page_w, page_h)
|
|
|
+ cross_line_list = cross_line_process(cross_line_list, bias_line_list)
|
|
|
+ if not cross_line_list:
|
|
|
+ cross_line_list = get_cross_line(line_list_copy+bias_line_list, threshold=2, cross_times=1)
|
|
|
+ cross_line_list = cross_line_process(cross_line_list, bias_line_list)
|
|
|
+ if show:
|
|
|
+ print('get_page_lines cross_line_list2', cross_line_list)
|
|
|
|
|
|
# show
|
|
|
if show:
|
|
@@ -1287,6 +1349,15 @@ class PDFConvert:
|
|
|
# 水印行跳过
|
|
|
if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
|
|
|
continue
|
|
|
+ # 目录行跳过
|
|
|
+ continue_flag = False
|
|
|
+ for r in row:
|
|
|
+ if re.search('[.·]{7,}', r.get_text()):
|
|
|
+ continue_flag = True
|
|
|
+ break
|
|
|
+ if continue_flag:
|
|
|
+ continue
|
|
|
+
|
|
|
if len(row) == 1:
|
|
|
text = row[0].get_text()
|
|
|
bbox = row[0].bbox
|
|
@@ -1359,7 +1430,7 @@ class PDFConvert:
|
|
|
lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
|
|
|
log("convert_pdf page " + str(page_no))
|
|
|
log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
|
|
|
- log('layout.width, layout.height' + str(layout.width) + str(layout.height))
|
|
|
+ log('layout.width, layout.height ' + str(layout.width) + str(layout.height))
|
|
|
|
|
|
# 若只有文本且图片数为0,直接提取文字及表格
|
|
|
# if only_image == 0 and image_count == 0:
|
|
@@ -1412,7 +1483,7 @@ class PDFConvert:
|
|
|
|
|
|
# 若该页图片数量过多,或无文本,则直接ocr整页识别
|
|
|
# elif image_count > 3 or only_image == 1:
|
|
|
- if len(lt_image_list) > 3 or len(lt_text_list) == 0:
|
|
|
+ if len(lt_image_list) > 4 or len(lt_text_list) == 0:
|
|
|
page_image = self.get_page_image(page_no)
|
|
|
if judge_error_code(page_image):
|
|
|
self._page.error_code = page_image
|
|
@@ -1441,6 +1512,8 @@ class PDFConvert:
|
|
|
_image = _Image(page_image[1], page_image[0])
|
|
|
_image.is_from_pdf = True
|
|
|
self._page.add_child(_image)
|
|
|
+ image_md5 = get_md5_from_bytes(page_image[1])
|
|
|
+ self.md5_image_obj_list.append([image_md5, _image])
|
|
|
return
|
|
|
# 比较小的图则直接保存用ocr识别
|
|
|
else:
|
|
@@ -1451,6 +1524,8 @@ class PDFConvert:
|
|
|
image_stream = ff.read()
|
|
|
_image = _Image(image_stream, temp_path, image.bbox)
|
|
|
self._page.add_child(_image)
|
|
|
+ image_md5 = get_md5_from_bytes(image_stream)
|
|
|
+ self.md5_image_obj_list.append([image_md5, _image])
|
|
|
except Exception:
|
|
|
log("pdf2text pdfminer read image in page " + str(page_no) +
|
|
|
" fail! use pymupdf read image...")
|
|
@@ -1580,6 +1655,7 @@ class PDFConvert:
|
|
|
# 0: 前一页最后一个表格为A,后一页第一个表格为B
|
|
|
# 1.1: A后无文本(除了页码),且B前无文本(除了页码)
|
|
|
# 1.2: B前有文字(可能是页眉,小于60字),且B的第一行前几个单元格为空,且第一行不为空的单元格有文字较多的格子
|
|
|
+ # 1.3: B前有文字(可能是页眉,小于60字),且B的第一行第一个单元格为空,且有文字的格子数量占所有格子的一半
|
|
|
connect_flag_list = []
|
|
|
soup_list = []
|
|
|
for i, h in enumerate(html_list):
|
|
@@ -1622,12 +1698,16 @@ class PDFConvert:
|
|
|
if rows:
|
|
|
first_row = rows[0]
|
|
|
col_text_list = [len(x.text) for x in first_row]
|
|
|
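+ # The prefix h[:first_table_start] is at most 60 characters, the first cell of the first row is empty, and the longest cell has at least 30 characters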
|
|
|
if len(h[:first_table_start]) <= 60 and col_text_list[0] == 0 and max(col_text_list) >= 30:
|
|
|
connect_flag2 = True
|
|
|
+ # At least half of the first-row cells are empty and the first cell is empty
|
|
|
+ elif col_text_list.count(0) >= len(col_text_list) / 2 and col_text_list[0] == 0:
|
|
|
+ connect_flag2 = True
|
|
|
|
|
|
connect_flag_list.append([i, connect_flag2, connect_flag1])
|
|
|
|
|
|
- # print('connect_flag_list', connect_flag_list)
|
|
|
+ print('connect_flag_list', connect_flag_list)
|
|
|
|
|
|
# 根据条件1合并需连接页码,形成组
|
|
|
connect_pages_list = []
|
|
@@ -1645,7 +1725,7 @@ class PDFConvert:
|
|
|
if temp_list:
|
|
|
connect_pages_list.append(temp_list)
|
|
|
|
|
|
- # print('connect_pages_list', connect_pages_list)
|
|
|
+ print('connect_pages_list', connect_pages_list)
|
|
|
|
|
|
# 判断后续条件:判断组内列数是否相同
|
|
|
connect_pages_list2 = []
|
|
@@ -1654,6 +1734,8 @@ class PDFConvert:
|
|
|
connect_pages_list2.append(c_list)
|
|
|
else:
|
|
|
col_cnt_list = []
|
|
|
+ # 单元格可能被复制了,相同的合并当做一列
|
|
|
+ merge_col_cnt_list = []
|
|
|
for c in c_list:
|
|
|
soup = soup_list[c[0]]
|
|
|
table1 = soup.findAll('table')[-1]
|
|
@@ -1663,10 +1745,32 @@ class PDFConvert:
|
|
|
td1 = tr1[-1].findAll('td')
|
|
|
td2 = tr2[0].findAll('td')
|
|
|
col_cnt_list.append([len(td2), len(td1)])
|
|
|
+
|
|
|
+ # # 计算合并重复文本格子后的列数
|
|
|
+ # last_text = td1[0].text
|
|
|
+ # merge_td1 = [last_text]
|
|
|
+ # for td in td1:
|
|
|
+ # if td.text == last_text:
|
|
|
+ # continue
|
|
|
+ # else:
|
|
|
+ # merge_td1.append(td.text)
|
|
|
+ # last_text = td.text
|
|
|
+ # last_text = td2[0].text
|
|
|
+ # merge_td2 = [last_text]
|
|
|
+ # for td in td2:
|
|
|
+ # if td.text == last_text:
|
|
|
+ # continue
|
|
|
+ # else:
|
|
|
+ # merge_td2.append(td.text)
|
|
|
+ # last_text = td.text
|
|
|
+ # merge_col_cnt_list.append([len(merge_td2), len(merge_td1)])
|
|
|
+
|
|
|
+ # 判断
|
|
|
new_c_list = [c_list[0]]
|
|
|
# print('col_cnt_list', col_cnt_list)
|
|
|
for i in range(len(col_cnt_list) - 1):
|
|
|
if col_cnt_list[i][1] != col_cnt_list[i + 1][0]:
|
|
|
+ # and merge_col_cnt_list[i][1] != merge_col_cnt_list[i + 1][0]:
|
|
|
connect_pages_list2.append(new_c_list)
|
|
|
new_c_list = [c_list[i + 1]]
|
|
|
else:
|
|
@@ -1674,7 +1778,7 @@ class PDFConvert:
|
|
|
if new_c_list:
|
|
|
connect_pages_list2.append(new_c_list)
|
|
|
|
|
|
- # print('connect_pages_list2', connect_pages_list2)
|
|
|
+ print('connect_pages_list2', connect_pages_list2)
|
|
|
|
|
|
# 符合连接条件的拼接表格
|
|
|
new_html_list = []
|