2 rokov pred · f902f03f77
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -822,7 +822,7 @@ class PDFConvert:
 
															         log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
														
 
															         return lt_line_list
														
 
															-    def get_page_lines(self, layout, page_no):
														
 
															+    def get_page_lines(self, layout, page_no, show=0):
														
 
															         def _plot(_line_list, mode=1):
														
 
															             for _line in _line_list:
														
 
															                 if mode == 1:
														
@@ -1180,10 +1180,12 @@ class PDFConvert:
 
															         # 删除最外层嵌套边框
														
 
															         cross_line_list = remove_outline_no_cross(cross_line_list)
														
 
															+
														
 
															         # show
														
 
															-        # print('len(cross_line_list)', len(cross_line_list))
														
 
															-        # _plot(line_list, mode=2)
														
 
															-        # _plot(cross_line_list, mode=2)
														
 
															+        if show:
														
 
															+            print('len(cross_line_list)', len(cross_line_list))
														
 
															+            # _plot(line_list, mode=2)
														
 
															+            _plot(cross_line_list, mode=2)
														
 
															         lt_line_list = []
														
 
															         for line in cross_line_list:
														
@@ -1193,7 +1195,7 @@ class PDFConvert:
 
															         return lt_line_list
														
 
															     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
														
 
															-        list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list)
														
 
															+        list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list, from_pdf=True)
														
 
															         self._page.in_table_objs = filter_objs
														
 
															         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
														
@@ -1542,42 +1544,58 @@ class PDFConvert:
 
															         if not html_list:
														
 
															             return html_list
														
 
															-        # 判断条件1：最后一个表格后有无非页码文本/第一个表格前有无文本
														
 
															+        # 判断初始条件1
														
 
															+        # 0: 前一页最后一个表格为A，后一页第一个表格为B
														
 
															+        # 1.1: A后无文本(除了页码)，且B前无文本(除了页码)
														
 
															+        # 1.2: B前有文字(可能是页眉，小于60字)，且B的第一行前几个单元格为空，且第一行不为空的单元格有文字较多的格子
														
 
															         connect_flag_list = []
														
 
															         soup_list = []
														
 
															         for i, h in enumerate(html_list):
														
 
															-            soup_list.append(BeautifulSoup(h, 'lxml'))
														
 
															+            soup = BeautifulSoup(h, 'lxml')
														
 
															+            soup_list.append(soup)
														
 
															             # 找最后一个表格
														
 
															-            table_start1, table_end1 = None, None
														
 
															+            last_table_start, last_table_end = None, None
														
 
															             # print('h', h)
														
 
															             match = re.finditer('<table', h)
														
 
															             for m in match:
														
 
															-                table_start1 = m.span()[0]
														
 
															-            if table_start1 is not None:
														
 
															-                match = re.finditer('</table>', h[table_start1:])
														
 
															+                last_table_start = m.span()[0]
														
 
															+            if last_table_start is not None:
														
 
															+                match = re.finditer('</table>', h[last_table_start:])
														
 
															                 for m in match:
														
 
															-                    table_end1 = m.span()[1] + table_start1
														
 
															+                    last_table_end = m.span()[1] + last_table_start
														
 
															             # 最后一个表格后有无除了页码外的内容
														
 
															             connect_flag1 = False
														
 
															-            if table_end1 is not None:
														
 
															-                match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[table_end1:]))
														
 
															+            if last_table_end is not None:
														
 
															+                match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[last_table_end:]))
														
 
															                 # print('match1', match.group())
														
 
															                 if not match or match.group() == '':
														
 
															                     connect_flag1 = True
														
 
															             # 找第一个表格
														
 
															-            table_start2, table_end2 = None, None
														
 
															+            first_table_start, first_table_end = None, None
														
 
															             match = re.finditer('<table', h)
														
 
															             for m in match:
														
 
															-                table_start2 = m.span()[0]
														
 
															+                first_table_start = m.span()[0]
														
 
															                 break
														
 
															             # 第一个表格后有无内容
														
 
															             connect_flag2 = False
														
 
															-            if table_start2 is not None and table_start2 == 0:
														
 
															+            if first_table_start is not None and first_table_start == 0:
														
 
															                 connect_flag2 = True
														
 
															+            # 有内容但是是页眉
														
 
															+            if not connect_flag2:
														
 
															+                tables = soup.findAll('table')
														
 
															+                if tables:
														
 
															+                    first_table = tables[0]
														
 
															+                    rows = first_table.findAll('tr')
														
 
															+                    if rows:
														
 
															+                        first_row = rows[0]
														
 
															+                        col_text_list = [len(x.text) for x in first_row]
														
 
															+                        if len(h[:first_table_start]) <= 60 and col_text_list[0] == 0 and max(col_text_list) >= 30:
														
 
															+                            connect_flag2 = True
														
 
															+
														
 
															             connect_flag_list.append([i, connect_flag2, connect_flag1])
														
 
															-        # print('connect_flag_list', connect_flag_list)
														
 
															+        print('connect_flag_list', connect_flag_list)
														
 
															         # 根据条件1合并需连接页码，形成组
														
 
															         connect_pages_list = []
														
@@ -1595,9 +1613,9 @@ class PDFConvert:
 
															         if temp_list:
														
 
															             connect_pages_list.append(temp_list)
														
 
															-        # print('connect_pages_list', connect_pages_list)
														
 
															+        print('connect_pages_list', connect_pages_list)
														
 
															-        # 判断条件2：判断组内列数是否相同
														
 
															+        # 判断后续条件：判断组内列数是否相同
														
 
															         connect_pages_list2 = []
														
 
															         for c_list in connect_pages_list:
														
 
															             if len(c_list) == 1:
														
@@ -1624,7 +1642,7 @@ class PDFConvert:
 
															                 if new_c_list:
														
 
															                     connect_pages_list2.append(new_c_list)
														
 
															-        # print('connect_pages_list2', connect_pages_list2)
														
 
															+        print('connect_pages_list2', connect_pages_list2)
														
 
															         # 符合连接条件的拼接表格
														
 
															         new_html_list = []
														
@@ -1634,9 +1652,14 @@ class PDFConvert:
 
															                 continue
														
 
															             new_html = ''
														
 
															             for c in c_list:
														
 
															-                new_html += html_list[c[0]]
														
 
															-            new_html = re.sub('</table>([-/第页0-9]|<div>|</div>)*<table border="1">', '<tr><td>#@#@#</td></tr>',
														
 
															+                # 加#@#@#防止替换错表格
														
 
															+                new_html += html_list[c[0]] + '#@#@#'
														
 
															+            new_html = new_html[:-5]
														
 
															+            # ([-/第页0-9]|<div>|</div>)*
														
 
															+            new_html = re.sub('</table>((<div>[-/第页0-9]*</div>#@#@#)|(#@#@#<div>[^<]*</div>)|#@#@#)<table border="1">',
														
 
															+                              '<tr><td>#@#@#</td></tr>',
														
 
															                               new_html)
														
 
															+            print('new_html', new_html)
														
 
															             soup = BeautifulSoup(new_html, 'lxml')
														
 
															             trs = soup.findAll('tr')