|
@@ -1181,6 +1181,10 @@ class PDFConvert:
|
|
# 删除最外层嵌套边框
|
|
# 删除最外层嵌套边框
|
|
cross_line_list = remove_outline_no_cross(cross_line_list)
|
|
cross_line_list = remove_outline_no_cross(cross_line_list)
|
|
|
|
|
|
|
|
+ # 复用otr的部分后处理,补线
|
|
|
|
+ from otr.table_line_new import table_line_pdf
|
|
|
|
+ cross_line_list = table_line_pdf(cross_line_list, page_w, page_h)
|
|
|
|
+
|
|
# show
|
|
# show
|
|
if show:
|
|
if show:
|
|
print('len(cross_line_list)', len(cross_line_list))
|
|
print('len(cross_line_list)', len(cross_line_list))
|
|
@@ -1214,6 +1218,8 @@ class PDFConvert:
|
|
# pdf对象需反向排序
|
|
# pdf对象需反向排序
|
|
self._page.is_reverse = True
|
|
self._page.is_reverse = True
|
|
|
|
|
|
|
|
+ return list_tables
|
|
|
|
+
|
|
def is_text_legal(self, lt_text_list, page_no):
|
|
def is_text_legal(self, lt_text_list, page_no):
|
|
# 无法识别pdf字符编码,整页用ocr
|
|
# 无法识别pdf字符编码,整页用ocr
|
|
text_temp = ""
|
|
text_temp = ""
|
|
@@ -1244,7 +1250,11 @@ class PDFConvert:
|
|
|
|
|
|
return True
|
|
return True
|
|
|
|
|
|
- def judge_b_table(self, lt_text_list):
|
|
|
|
|
|
+ def judge_b_table(self, lt_text_list, table_list):
|
|
|
|
+ table_h_list = []
|
|
|
|
+ for table in table_list:
|
|
|
|
+ table_h_list.append([table.get('bbox')[1], table.get('bbox')[3]])
|
|
|
|
+
|
|
# 先分行
|
|
# 先分行
|
|
lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
|
|
lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
|
|
lt_text_row_list = []
|
|
lt_text_row_list = []
|
|
@@ -1272,6 +1282,7 @@ class PDFConvert:
|
|
tolerate_cnt = 2
|
|
tolerate_cnt = 2
|
|
t_cnt = 0
|
|
t_cnt = 0
|
|
row_cnt = 0
|
|
row_cnt = 0
|
|
|
|
+ b_table_row_list = []
|
|
for row in lt_text_row_list:
|
|
for row in lt_text_row_list:
|
|
# 水印行跳过
|
|
# 水印行跳过
|
|
if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
|
|
if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
|
|
@@ -1284,19 +1295,40 @@ class PDFConvert:
|
|
and re.search('[\u4e00-\u9fff]{2,}', text[match.span()[1]:]):
|
|
and re.search('[\u4e00-\u9fff]{2,}', text[match.span()[1]:]):
|
|
row_cnt += 1
|
|
row_cnt += 1
|
|
t_cnt = 0
|
|
t_cnt = 0
|
|
|
|
+ b_table_row_list += row
|
|
else:
|
|
else:
|
|
# 容忍
|
|
# 容忍
|
|
if t_cnt < tolerate_cnt:
|
|
if t_cnt < tolerate_cnt:
|
|
t_cnt += 1
|
|
t_cnt += 1
|
|
continue
|
|
continue
|
|
row_cnt = 0
|
|
row_cnt = 0
|
|
|
|
+ b_table_row_list = []
|
|
else:
|
|
else:
|
|
row_cnt += 1
|
|
row_cnt += 1
|
|
t_cnt = 0
|
|
t_cnt = 0
|
|
|
|
+ b_table_row_list += row
|
|
|
|
|
|
if row_cnt >= is_b_table_cnt:
|
|
if row_cnt >= is_b_table_cnt:
|
|
- is_b_table_flag = True
|
|
|
|
- break
|
|
|
|
|
|
+ # 判断在不在有边框表格的范围
|
|
|
|
+ in_flag = False
|
|
|
|
+ for table_h in table_h_list:
|
|
|
|
+ for b in b_table_row_list:
|
|
|
|
+ # print('b.bbox', b.bbox)
|
|
|
|
+ # print(table_h)
|
|
|
|
+ if table_h[1] <= b.bbox[1] <= table_h[0] or table_h[1] <= b.bbox[3] <= table_h[0]:
|
|
|
|
+ in_flag = True
|
|
|
|
+ break
|
|
|
|
+ if in_flag:
|
|
|
|
+ break
|
|
|
|
+ if in_flag:
|
|
|
|
+ is_b_table_flag = False
|
|
|
|
+ t_cnt = 0
|
|
|
|
+ row_cnt = 0
|
|
|
|
+ else:
|
|
|
|
+ print('True b_table_row_list', b_table_row_list)
|
|
|
|
+ print('table_h_list', table_h_list)
|
|
|
|
+ is_b_table_flag = True
|
|
|
|
+ break
|
|
log('pdf is_b_table_flag ' + str(is_b_table_flag))
|
|
log('pdf is_b_table_flag ' + str(is_b_table_flag))
|
|
return is_b_table_flag
|
|
return is_b_table_flag
|
|
|
|
|
|
@@ -1430,8 +1462,16 @@ class PDFConvert:
|
|
if not self.is_text_legal(lt_text_list, page_no):
|
|
if not self.is_text_legal(lt_text_list, page_no):
|
|
return
|
|
return
|
|
|
|
|
|
|
|
+ try:
|
|
|
|
+ lt_line_list = self.get_page_lines(layout, page_no)
|
|
|
|
+ except:
|
|
|
|
+ traceback.print_exc()
|
|
|
|
+ lt_line_list = []
|
|
|
|
+ self._page.error_code = [-13]
|
|
|
|
+ table_list = self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
|
|
|
|
+
|
|
# 根据text规律,判断该页是否可能有无边框表格
|
|
# 根据text规律,判断该页是否可能有无边框表格
|
|
- if self.judge_b_table(lt_text_list):
|
|
|
|
|
|
+ if self.judge_b_table(lt_text_list, table_list):
|
|
page_image = self.get_page_image(page_no)
|
|
page_image = self.get_page_image(page_no)
|
|
if judge_error_code(page_image):
|
|
if judge_error_code(page_image):
|
|
self._page.error_code = page_image
|
|
self._page.error_code = page_image
|
|
@@ -1443,14 +1483,6 @@ class PDFConvert:
|
|
_image.b_table_layout_size = (layout.width, layout.height)
|
|
_image.b_table_layout_size = (layout.width, layout.height)
|
|
self._page.add_child(_image)
|
|
self._page.add_child(_image)
|
|
|
|
|
|
- try:
|
|
|
|
- lt_line_list = self.get_page_lines(layout, page_no)
|
|
|
|
- except:
|
|
|
|
- traceback.print_exc()
|
|
|
|
- lt_line_list = []
|
|
|
|
- self._page.error_code = [-13]
|
|
|
|
- self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
|
|
|
|
-
|
|
|
|
def get_layout(self, page, page_no):
|
|
def get_layout(self, page, page_no):
|
|
log("get_layout")
|
|
log("get_layout")
|
|
if self.has_init_pdf[0] == 0:
|
|
if self.has_init_pdf[0] == 0:
|
|
@@ -1566,7 +1598,7 @@ class PDFConvert:
|
|
# 最后一个表格后有无除了页码外的内容
|
|
# 最后一个表格后有无除了页码外的内容
|
|
connect_flag1 = False
|
|
connect_flag1 = False
|
|
if last_table_end is not None:
|
|
if last_table_end is not None:
|
|
- match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[last_table_end:]))
|
|
|
|
|
|
+ match = re.search('[^-/第页0-9,,]*', re.sub('<div>|</div>', '', h[last_table_end:]))
|
|
# print('match1', match.group())
|
|
# print('match1', match.group())
|
|
if not match or match.group() == '':
|
|
if not match or match.group() == '':
|
|
connect_flag1 = True
|
|
connect_flag1 = True
|
|
@@ -1595,7 +1627,7 @@ class PDFConvert:
|
|
|
|
|
|
connect_flag_list.append([i, connect_flag2, connect_flag1])
|
|
connect_flag_list.append([i, connect_flag2, connect_flag1])
|
|
|
|
|
|
- print('connect_flag_list', connect_flag_list)
|
|
|
|
|
|
+ # print('connect_flag_list', connect_flag_list)
|
|
|
|
|
|
# 根据条件1合并需连接页码,形成组
|
|
# 根据条件1合并需连接页码,形成组
|
|
connect_pages_list = []
|
|
connect_pages_list = []
|
|
@@ -1613,7 +1645,7 @@ class PDFConvert:
|
|
if temp_list:
|
|
if temp_list:
|
|
connect_pages_list.append(temp_list)
|
|
connect_pages_list.append(temp_list)
|
|
|
|
|
|
- print('connect_pages_list', connect_pages_list)
|
|
|
|
|
|
+ # print('connect_pages_list', connect_pages_list)
|
|
|
|
|
|
# 判断后续条件:判断组内列数是否相同
|
|
# 判断后续条件:判断组内列数是否相同
|
|
connect_pages_list2 = []
|
|
connect_pages_list2 = []
|
|
@@ -1642,7 +1674,7 @@ class PDFConvert:
|
|
if new_c_list:
|
|
if new_c_list:
|
|
connect_pages_list2.append(new_c_list)
|
|
connect_pages_list2.append(new_c_list)
|
|
|
|
|
|
- print('connect_pages_list2', connect_pages_list2)
|
|
|
|
|
|
+ # print('connect_pages_list2', connect_pages_list2)
|
|
|
|
|
|
# 符合连接条件的拼接表格
|
|
# 符合连接条件的拼接表格
|
|
new_html_list = []
|
|
new_html_list = []
|
|
@@ -1656,10 +1688,10 @@ class PDFConvert:
|
|
new_html += html_list[c[0]] + '#@#@#'
|
|
new_html += html_list[c[0]] + '#@#@#'
|
|
new_html = new_html[:-5]
|
|
new_html = new_html[:-5]
|
|
# ([-/第页0-9]|<div>|</div>)*
|
|
# ([-/第页0-9]|<div>|</div>)*
|
|
- new_html = re.sub('</table>((<div>[-/第页0-9]*</div>#@#@#)|(#@#@#<div>[^<]*</div>)|#@#@#)<table border="1">',
|
|
|
|
|
|
+ new_html = re.sub('</table>((<div>[-/第页0-9,,]*</div>#@#@#)|(#@#@#<div>[^<]*</div>)|#@#@#)<table border="1">',
|
|
'<tr><td>#@#@#</td></tr>',
|
|
'<tr><td>#@#@#</td></tr>',
|
|
new_html)
|
|
new_html)
|
|
- print('new_html', new_html)
|
|
|
|
|
|
+ # print('new_html', new_html)
|
|
|
|
|
|
soup = BeautifulSoup(new_html, 'lxml')
|
|
soup = BeautifulSoup(new_html, 'lxml')
|
|
trs = soup.findAll('tr')
|
|
trs = soup.findAll('tr')
|