|
@@ -822,7 +822,7 @@ class PDFConvert:
|
|
log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
|
|
log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
|
|
return lt_line_list
|
|
return lt_line_list
|
|
|
|
|
|
- def get_page_lines(self, layout, page_no):
|
|
|
|
|
|
+ def get_page_lines(self, layout, page_no, show=0):
|
|
def _plot(_line_list, mode=1):
|
|
def _plot(_line_list, mode=1):
|
|
for _line in _line_list:
|
|
for _line in _line_list:
|
|
if mode == 1:
|
|
if mode == 1:
|
|
@@ -1180,10 +1180,12 @@ class PDFConvert:
|
|
|
|
|
|
# 删除最外层嵌套边框
|
|
# 删除最外层嵌套边框
|
|
cross_line_list = remove_outline_no_cross(cross_line_list)
|
|
cross_line_list = remove_outline_no_cross(cross_line_list)
|
|
|
|
+
|
|
# show
|
|
# show
|
|
- # print('len(cross_line_list)', len(cross_line_list))
|
|
|
|
- # _plot(line_list, mode=2)
|
|
|
|
- # _plot(cross_line_list, mode=2)
|
|
|
|
|
|
+ if show:
|
|
|
|
+ print('len(cross_line_list)', len(cross_line_list))
|
|
|
|
+ # _plot(line_list, mode=2)
|
|
|
|
+ _plot(cross_line_list, mode=2)
|
|
|
|
|
|
lt_line_list = []
|
|
lt_line_list = []
|
|
for line in cross_line_list:
|
|
for line in cross_line_list:
|
|
@@ -1193,7 +1195,7 @@ class PDFConvert:
|
|
return lt_line_list
|
|
return lt_line_list
|
|
|
|
|
|
def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
|
|
def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
|
|
- list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list)
|
|
|
|
|
|
+ list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list, from_pdf=True)
|
|
self._page.in_table_objs = filter_objs
|
|
self._page.in_table_objs = filter_objs
|
|
|
|
|
|
# print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
|
|
# print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
|
|
@@ -1542,42 +1544,58 @@ class PDFConvert:
|
|
if not html_list:
|
|
if not html_list:
|
|
return html_list
|
|
return html_list
|
|
|
|
|
|
- # 判断条件1:最后一个表格后有无非页码文本/第一个表格前有无文本
|
|
|
|
|
|
+ # 判断初始条件1
|
|
|
|
+ # 0: 前一页最后一个表格为A,后一页第一个表格为B
|
|
|
|
+ # 1.1: A后无文本(除了页码),且B前无文本(除了页码)
|
|
|
|
+ # 1.2: B前有文字(可能是页眉,小于60字),且B的第一行前几个单元格为空,且第一行不为空的单元格有文字较多的格子
|
|
connect_flag_list = []
|
|
connect_flag_list = []
|
|
soup_list = []
|
|
soup_list = []
|
|
for i, h in enumerate(html_list):
|
|
for i, h in enumerate(html_list):
|
|
- soup_list.append(BeautifulSoup(h, 'lxml'))
|
|
|
|
|
|
+ soup = BeautifulSoup(h, 'lxml')
|
|
|
|
+ soup_list.append(soup)
|
|
# 找最后一个表格
|
|
# 找最后一个表格
|
|
- table_start1, table_end1 = None, None
|
|
|
|
|
|
+ last_table_start, last_table_end = None, None
|
|
# print('h', h)
|
|
# print('h', h)
|
|
match = re.finditer('<table', h)
|
|
match = re.finditer('<table', h)
|
|
for m in match:
|
|
for m in match:
|
|
- table_start1 = m.span()[0]
|
|
|
|
- if table_start1 is not None:
|
|
|
|
- match = re.finditer('</table>', h[table_start1:])
|
|
|
|
|
|
+ last_table_start = m.span()[0]
|
|
|
|
+ if last_table_start is not None:
|
|
|
|
+ match = re.finditer('</table>', h[last_table_start:])
|
|
for m in match:
|
|
for m in match:
|
|
- table_end1 = m.span()[1] + table_start1
|
|
|
|
|
|
+ last_table_end = m.span()[1] + last_table_start
|
|
# 最后一个表格后有无除了页码外的内容
|
|
# 最后一个表格后有无除了页码外的内容
|
|
connect_flag1 = False
|
|
connect_flag1 = False
|
|
- if table_end1 is not None:
|
|
|
|
- match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[table_end1:]))
|
|
|
|
|
|
+ if last_table_end is not None:
|
|
|
|
+ match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[last_table_end:]))
|
|
# print('match1', match.group())
|
|
# print('match1', match.group())
|
|
if not match or match.group() == '':
|
|
if not match or match.group() == '':
|
|
connect_flag1 = True
|
|
connect_flag1 = True
|
|
|
|
|
|
# 找第一个表格
|
|
# 找第一个表格
|
|
- table_start2, table_end2 = None, None
|
|
|
|
|
|
+ first_table_start, first_table_end = None, None
|
|
match = re.finditer('<table', h)
|
|
match = re.finditer('<table', h)
|
|
for m in match:
|
|
for m in match:
|
|
- table_start2 = m.span()[0]
|
|
|
|
|
|
+ first_table_start = m.span()[0]
|
|
break
|
|
break
|
|
# 第一个表格后有无内容
|
|
# 第一个表格后有无内容
|
|
connect_flag2 = False
|
|
connect_flag2 = False
|
|
- if table_start2 is not None and table_start2 == 0:
|
|
|
|
|
|
+ if first_table_start is not None and first_table_start == 0:
|
|
connect_flag2 = True
|
|
connect_flag2 = True
|
|
|
|
+ # 有内容但是是页眉
|
|
|
|
+ if not connect_flag2:
|
|
|
|
+ tables = soup.findAll('table')
|
|
|
|
+ if tables:
|
|
|
|
+ first_table = tables[0]
|
|
|
|
+ rows = first_table.findAll('tr')
|
|
|
|
+ if rows:
|
|
|
|
+ first_row = rows[0]
|
|
|
|
+ col_text_list = [len(x.text) for x in first_row]
|
|
|
|
+ if len(h[:first_table_start]) <= 60 and col_text_list[0] == 0 and max(col_text_list) >= 30:
|
|
|
|
+ connect_flag2 = True
|
|
|
|
+
|
|
connect_flag_list.append([i, connect_flag2, connect_flag1])
|
|
connect_flag_list.append([i, connect_flag2, connect_flag1])
|
|
|
|
|
|
- # print('connect_flag_list', connect_flag_list)
|
|
|
|
|
|
+ print('connect_flag_list', connect_flag_list)
|
|
|
|
|
|
# 根据条件1合并需连接页码,形成组
|
|
# 根据条件1合并需连接页码,形成组
|
|
connect_pages_list = []
|
|
connect_pages_list = []
|
|
@@ -1595,9 +1613,9 @@ class PDFConvert:
|
|
if temp_list:
|
|
if temp_list:
|
|
connect_pages_list.append(temp_list)
|
|
connect_pages_list.append(temp_list)
|
|
|
|
|
|
- # print('connect_pages_list', connect_pages_list)
|
|
|
|
|
|
+ print('connect_pages_list', connect_pages_list)
|
|
|
|
|
|
- # 判断条件2:判断组内列数是否相同
|
|
|
|
|
|
+ # 判断后续条件:判断组内列数是否相同
|
|
connect_pages_list2 = []
|
|
connect_pages_list2 = []
|
|
for c_list in connect_pages_list:
|
|
for c_list in connect_pages_list:
|
|
if len(c_list) == 1:
|
|
if len(c_list) == 1:
|
|
@@ -1624,7 +1642,7 @@ class PDFConvert:
|
|
if new_c_list:
|
|
if new_c_list:
|
|
connect_pages_list2.append(new_c_list)
|
|
connect_pages_list2.append(new_c_list)
|
|
|
|
|
|
- # print('connect_pages_list2', connect_pages_list2)
|
|
|
|
|
|
+ print('connect_pages_list2', connect_pages_list2)
|
|
|
|
|
|
# 符合连接条件的拼接表格
|
|
# 符合连接条件的拼接表格
|
|
new_html_list = []
|
|
new_html_list = []
|
|
@@ -1634,9 +1652,14 @@ class PDFConvert:
|
|
continue
|
|
continue
|
|
new_html = ''
|
|
new_html = ''
|
|
for c in c_list:
|
|
for c in c_list:
|
|
- new_html += html_list[c[0]]
|
|
|
|
- new_html = re.sub('</table>([-/第页0-9]|<div>|</div>)*<table border="1">', '<tr><td>#@#@#</td></tr>',
|
|
|
|
|
|
+ # 加#@#@#防止替换错表格
|
|
|
|
+ new_html += html_list[c[0]] + '#@#@#'
|
|
|
|
+ new_html = new_html[:-5]
|
|
|
|
+ # ([-/第页0-9]|<div>|</div>)*
|
|
|
|
+ new_html = re.sub('</table>((<div>[-/第页0-9]*</div>#@#@#)|(#@#@#<div>[^<]*</div>)|#@#@#)<table border="1">',
|
|
|
|
+ '<tr><td>#@#@#</td></tr>',
|
|
new_html)
|
|
new_html)
|
|
|
|
+ print('new_html', new_html)
|
|
|
|
|
|
soup = BeautifulSoup(new_html, 'lxml')
|
|
soup = BeautifulSoup(new_html, 'lxml')
|
|
trs = soup.findAll('tr')
|
|
trs = soup.findAll('tr')
|