
Add table connection rules

fangjiasheng committed 1 year ago
parent commit f902f03f77
1 file changed, with 46 additions and 23 deletions:
      format_convert/convert_pdf.py

format_convert/convert_pdf.py  (+46, -23)

@@ -822,7 +822,7 @@ class PDFConvert:
         log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
         return lt_line_list
 
-    def get_page_lines(self, layout, page_no):
+    def get_page_lines(self, layout, page_no, show=0):
         def _plot(_line_list, mode=1):
             for _line in _line_list:
                 if mode == 1:
@@ -1180,10 +1180,12 @@ class PDFConvert:
 
         # remove the outermost nested border
         cross_line_list = remove_outline_no_cross(cross_line_list)
+
         # show
-        # print('len(cross_line_list)', len(cross_line_list))
-        # _plot(line_list, mode=2)
-        # _plot(cross_line_list, mode=2)
+        if show:
+            print('len(cross_line_list)', len(cross_line_list))
+            # _plot(line_list, mode=2)
+            _plot(cross_line_list, mode=2)
 
         lt_line_list = []
         for line in cross_line_list:
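
The show switch introduced here (together with the matching signature change in the first hunk) gates the debug output that was previously commented out. A usage sketch, assuming a layout already parsed by pdfminer; the instance name converter is illustrative:

    # Illustrative call only: show=0 (the default) keeps normal runs quiet,
    # while show=1 re-enables the count print and the _plot() visualization.
    lt_line_list = converter.get_page_lines(layout, page_no, show=1)
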
@@ -1193,7 +1195,7 @@ class PDFConvert:
         return lt_line_list
 
     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
-        list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list)
+        list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list, from_pdf=True)
         self._page.in_table_objs = filter_objs
 
         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
@@ -1542,42 +1544,58 @@ class PDFConvert:
         if not html_list:
             return html_list
 
-        # condition 1: is there non-page-number text after the last table / any text before the first table
+        # initial condition 1
+        # 0: let A be the last table on the previous page and B the first table on the next page
+        # 1.1: there is no text after A (except page numbers), and no text before B (except page numbers)
+        # 1.2: there is text before B (possibly a page header, under 60 characters), the first few cells of B's first row are empty, and some non-empty cell in that first row holds a longer run of text
         connect_flag_list = []
         soup_list = []
         for i, h in enumerate(html_list):
-            soup_list.append(BeautifulSoup(h, 'lxml'))
+            soup = BeautifulSoup(h, 'lxml')
+            soup_list.append(soup)
             # find the last table
-            table_start1, table_end1 = None, None
+            last_table_start, last_table_end = None, None
             # print('h', h)
             match = re.finditer('<table', h)
             for m in match:
-                table_start1 = m.span()[0]
-            if table_start1 is not None:
-                match = re.finditer('</table>', h[table_start1:])
+                last_table_start = m.span()[0]
+            if last_table_start is not None:
+                match = re.finditer('</table>', h[last_table_start:])
                 for m in match:
-                    table_end1 = m.span()[1] + table_start1
+                    last_table_end = m.span()[1] + last_table_start
             # is there anything after the last table besides page numbers
             connect_flag1 = False
-            if table_end1 is not None:
-                match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[table_end1:]))
+            if last_table_end is not None:
+                match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[last_table_end:]))
                 # print('match1', match.group())
                 if not match or match.group() == '':
                     connect_flag1 = True
 
             # find the first table
-            table_start2, table_end2 = None, None
+            first_table_start, first_table_end = None, None
             match = re.finditer('<table', h)
             for m in match:
-                table_start2 = m.span()[0]
+                first_table_start = m.span()[0]
                 break
             # is there any content before the first table
             connect_flag2 = False
-            if table_start2 is not None and table_start2 == 0:
+            if first_table_start is not None and first_table_start == 0:
                 connect_flag2 = True
+            # there is content, but it is a page header
+            if not connect_flag2:
+                tables = soup.findAll('table')
+                if tables:
+                    first_table = tables[0]
+                    rows = first_table.findAll('tr')
+                    if rows:
+                        first_row = rows[0]
+                        col_text_list = [len(x.text) for x in first_row]
+                        if len(h[:first_table_start]) <= 60 and col_text_list[0] == 0 and max(col_text_list) >= 30:
+                            connect_flag2 = True
+
             connect_flag_list.append([i, connect_flag2, connect_flag1])
 
-        # print('connect_flag_list', connect_flag_list)
+        print('connect_flag_list', connect_flag_list)
 
         # per condition 1, merge the pages that need connecting into groups
         connect_pages_list = []
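
The two flags computed above decide whether a page can join its neighbours: connect_flag1 is condition 1.1 applied to the page's tail (nothing but page numbers after the last table), and connect_flag2 covers its head (the first table starts the page, or is preceded only by a short header while the first row looks like a continuation). A self-contained sketch of the tail check on toy input; the page strings and the helper name are made up:

    import re

    page_a = '<table border="1"><tr><td>x</td></tr></table><div>第1页</div>'
    page_b = '<div>Some trailing paragraph</div>'

    def tail_connects(h):
        """Condition 1.1: after the last </table>, only page-number text remains."""
        last_end = None
        for m in re.finditer('</table>', h):
            last_end = m.span()[1]
        if last_end is None:
            return False
        tail = re.sub('<div>|</div>', '', h[last_end:])
        m = re.search('[^-/第页0-9]*', tail)
        return not m or m.group() == ''

    print(tail_connects(page_a))  # True: only '第1页' (a page number) follows
    print(tail_connects(page_b))  # False: no table on the page at all
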
@@ -1595,9 +1613,9 @@ class PDFConvert:
         if temp_list:
             connect_pages_list.append(temp_list)
 
-        # print('connect_pages_list', connect_pages_list)
+        print('connect_pages_list', connect_pages_list)
 
-        # condition 2: check whether the column counts within a group match
+        # follow-up condition: check whether the column counts within a group match
         connect_pages_list2 = []
         for c_list in connect_pages_list:
             if len(c_list) == 1:
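
Most of the grouping pass sits in unchanged context lines, so this reconstruction is approximate: page i and page i+1 land in the same group when i's tail flag and i+1's head flag are both set, using the [index, head_ok, tail_ok] entries built above. A runnable sketch with made-up flags:

    # Approximate sketch of the grouping pass (entry format as in connect_flag_list).
    connect_flag_list = [[0, False, True], [1, True, True], [2, True, False], [3, False, False]]

    connect_pages_list = []
    temp_list = [connect_flag_list[0]]
    for prev, curr in zip(connect_flag_list, connect_flag_list[1:]):
        if prev[2] and curr[1]:   # prev's tail and curr's head both connect
            temp_list.append(curr)
        else:                     # chain broken: close the current group
            connect_pages_list.append(temp_list)
            temp_list = [curr]
    if temp_list:
        connect_pages_list.append(temp_list)

    print(connect_pages_list)
    # [[[0, False, True], [1, True, True], [2, True, False]], [[3, False, False]]]
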
@@ -1624,7 +1642,7 @@ class PDFConvert:
                 if new_c_list:
                     connect_pages_list2.append(new_c_list)
 
-        # print('connect_pages_list2', connect_pages_list2)
+        print('connect_pages_list2', connect_pages_list2)
 
         # concatenate the tables that meet the connection conditions
         new_html_list = []
@@ -1634,9 +1652,14 @@ class PDFConvert:
                 continue
             new_html = ''
             for c in c_list:
-                new_html += html_list[c[0]]
-            new_html = re.sub('</table>([-/第页0-9]|<div>|</div>)*<table border="1">', '<tr><td>#@#@#</td></tr>',
+                # add #@#@# so the wrong table boundary is not replaced
+                new_html += html_list[c[0]] + '#@#@#'
+            new_html = new_html[:-5]
+            # ([-/第页0-9]|<div>|</div>)*
+            new_html = re.sub('</table>((<div>[-/第页0-9]*</div>#@#@#)|(#@#@#<div>[^<]*</div>)|#@#@#)<table border="1">',
+                              '<tr><td>#@#@#</td></tr>',
                               new_html)
+            print('new_html', new_html)
 
             soup = BeautifulSoup(new_html, 'lxml')
             trs = soup.findAll('tr')
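
The final hunk does the actual stitching. Joining the group's pages with the #@#@# sentinel guarantees that the rewrite pattern only fires at genuine page boundaries, never at a </table>...<table> pair that happens to occur within a single page. A minimal end-to-end sketch with two toy pages; the method afterwards re-parses the result (the soup/trs lines above) to post-process the marker rows:

    import re

    page1 = '<table border="1"><tr><td>a</td><td>b</td></tr></table><div>第1页</div>'
    page2 = '<table border="1"><tr><td>c</td><td>d</td></tr></table>'

    # Join with the sentinel, exactly as the loop above does.
    merged = '#@#@#'.join([page1, page2])

    # Same pattern as the commit: a closing tag, an optional page-number or
    # header <div> on either side of the sentinel, then the next opening tag,
    # all collapsed into a single marker row -- two tables become one.
    merged = re.sub(
        '</table>((<div>[-/第页0-9]*</div>#@#@#)|(#@#@#<div>[^<]*</div>)|#@#@#)<table border="1">',
        '<tr><td>#@#@#</td></tr>',
        merged)

    print(merged)
    # one table, with <tr><td>#@#@#</td></tr> marking the former page break:
    # <table border="1"><tr><td>a</td><td>b</td></tr><tr><td>#@#@#</td></tr><tr><td>c</td><td>d</td></tr></table>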