
Add table connection rules

fangjiasheng committed 1 year ago
parent commit f902f03f77
1 file changed, with 46 additions and 23 deletions:
      format_convert/convert_pdf.py

format_convert/convert_pdf.py  (+46, -23)

@@ -822,7 +822,7 @@ class PDFConvert:
         log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
         return lt_line_list
 
-    def get_page_lines(self, layout, page_no):
+    def get_page_lines(self, layout, page_no, show=0):
         def _plot(_line_list, mode=1):
             for _line in _line_list:
                 if mode == 1:
@@ -1180,10 +1180,12 @@ class PDFConvert:
 
         # remove the outermost nested border
         cross_line_list = remove_outline_no_cross(cross_line_list)
+
         # show
-        # print('len(cross_line_list)', len(cross_line_list))
-        # _plot(line_list, mode=2)
-        # _plot(cross_line_list, mode=2)
+        if show:
+            print('len(cross_line_list)', len(cross_line_list))
+            # _plot(line_list, mode=2)
+            _plot(cross_line_list, mode=2)
 
         lt_line_list = []
         for line in cross_line_list:
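
The show switch introduced here (together with the matching signature change in the first hunk) gates the debug output that was previously commented out. A usage sketch, assuming a layout already parsed by pdfminer; the instance name converter is illustrative:

    # Illustrative call only: show=0 (the default) keeps normal runs quiet,
    # while show=1 re-enables the count print and the _plot() visualization.
    lt_line_list = converter.get_page_lines(layout, page_no, show=1)
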
@@ -1193,7 +1195,7 @@ class PDFConvert:
         return lt_line_list
 
     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
-        list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list)
+        list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list, from_pdf=True)
         self._page.in_table_objs = filter_objs
 
         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
@@ -1542,42 +1544,58 @@ class PDFConvert:
         if not html_list:
             return html_list
 
-        # condition 1: is there non-page-number text after the last table / any text before the first table
+        # initial condition 1
+        # 0: let A be the last table on the previous page and B the first table on the next page
+        # 1.1: there is no text after A (except page numbers), and no text before B (except page numbers)
+        # 1.2: there is text before B (possibly a page header, under 60 characters), the first few cells of B's first row are empty, and some non-empty cell in that first row holds a longer run of text
         connect_flag_list = []
         soup_list = []
         for i, h in enumerate(html_list):
-            soup_list.append(BeautifulSoup(h, 'lxml'))
+            soup = BeautifulSoup(h, 'lxml')
+            soup_list.append(soup)
             # find the last table
-            table_start1, table_end1 = None, None
+            last_table_start, last_table_end = None, None
             # print('h', h)
             match = re.finditer('<table', h)
             for m in match:
-                table_start1 = m.span()[0]
-            if table_start1 is not None:
-                match = re.finditer('</table>', h[table_start1:])
+                last_table_start = m.span()[0]
+            if last_table_start is not None:
+                match = re.finditer('</table>', h[last_table_start:])
                 for m in match:
-                    table_end1 = m.span()[1] + table_start1
+                    last_table_end = m.span()[1] + last_table_start
             # is there anything after the last table besides page numbers
             connect_flag1 = False
-            if table_end1 is not None:
-                match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[table_end1:]))
+            if last_table_end is not None:
+                match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[last_table_end:]))
                 # print('match1', match.group())
                 if not match or match.group() == '':
                     connect_flag1 = True
 
             # find the first table
-            table_start2, table_end2 = None, None
+            first_table_start, first_table_end = None, None
             match = re.finditer('<table', h)
             for m in match:
-                table_start2 = m.span()[0]
+                first_table_start = m.span()[0]
                 break
             # is there any content before the first table
             connect_flag2 = False
-            if table_start2 is not None and table_start2 == 0:
+            if first_table_start is not None and first_table_start == 0:
                 connect_flag2 = True
+            # there is content, but it is a page header
+            if not connect_flag2:
+                tables = soup.findAll('table')
+                if tables:
+                    first_table = tables[0]
+                    rows = first_table.findAll('tr')
+                    if rows:
+                        first_row = rows[0]
+                        col_text_list = [len(x.text) for x in first_row]
+                        if len(h[:first_table_start]) <= 60 and col_text_list[0] == 0 and max(col_text_list) >= 30:
+                            connect_flag2 = True
+
             connect_flag_list.append([i, connect_flag2, connect_flag1])
 
-        # print('connect_flag_list', connect_flag_list)
+        print('connect_flag_list', connect_flag_list)
 
         # per condition 1, merge the pages that need connecting into groups
         connect_pages_list = []
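
The two flags computed above decide whether a page can join its neighbours: connect_flag1 is condition 1.1 applied to the page's tail (nothing but page numbers after the last table), and connect_flag2 covers its head (the first table starts the page, or is preceded only by a short header while the first row looks like a continuation). A self-contained sketch of the tail check on toy input; the page strings and the helper name are made up:

    import re

    page_a = '<table border="1"><tr><td>x</td></tr></table><div>第1页</div>'
    page_b = '<div>Some trailing paragraph</div>'

    def tail_connects(h):
        """Condition 1.1: after the last </table>, only page-number text remains."""
        last_end = None
        for m in re.finditer('</table>', h):
            last_end = m.span()[1]
        if last_end is None:
            return False
        tail = re.sub('<div>|</div>', '', h[last_end:])
        m = re.search('[^-/第页0-9]*', tail)
        return not m or m.group() == ''

    print(tail_connects(page_a))  # True: only '第1页' (a page number) follows
    print(tail_connects(page_b))  # False: no table on the page at all
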
@@ -1595,9 +1613,9 @@ class PDFConvert:
         if temp_list:
             connect_pages_list.append(temp_list)
 
-        # print('connect_pages_list', connect_pages_list)
+        print('connect_pages_list', connect_pages_list)
 
-        # condition 2: check whether the column counts within a group match
+        # follow-up condition: check whether the column counts within a group match
         connect_pages_list2 = []
         for c_list in connect_pages_list:
             if len(c_list) == 1:
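
Most of the grouping pass sits in unchanged context lines, so this reconstruction is approximate: page i and page i+1 land in the same group when i's tail flag and i+1's head flag are both set, using the [index, head_ok, tail_ok] entries built above. A runnable sketch with made-up flags:

    # Approximate sketch of the grouping pass (entry format as in connect_flag_list).
    connect_flag_list = [[0, False, True], [1, True, True], [2, True, False], [3, False, False]]

    connect_pages_list = []
    temp_list = [connect_flag_list[0]]
    for prev, curr in zip(connect_flag_list, connect_flag_list[1:]):
        if prev[2] and curr[1]:   # prev's tail and curr's head both connect
            temp_list.append(curr)
        else:                     # chain broken: close the current group
            connect_pages_list.append(temp_list)
            temp_list = [curr]
    if temp_list:
        connect_pages_list.append(temp_list)

    print(connect_pages_list)
    # [[[0, False, True], [1, True, True], [2, True, False]], [[3, False, False]]]
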
@@ -1624,7 +1642,7 @@ class PDFConvert:
                 if new_c_list:
                     connect_pages_list2.append(new_c_list)
 
-        # print('connect_pages_list2', connect_pages_list2)
+        print('connect_pages_list2', connect_pages_list2)
 
         # concatenate the tables that meet the connection conditions
         new_html_list = []
@@ -1634,9 +1652,14 @@ class PDFConvert:
                 continue
             new_html = ''
             for c in c_list:
-                new_html += html_list[c[0]]
-            new_html = re.sub('</table>([-/第页0-9]|<div>|</div>)*<table border="1">', '<tr><td>#@#@#</td></tr>',
+                # add #@#@# so the wrong table boundary is not replaced
+                new_html += html_list[c[0]] + '#@#@#'
+            new_html = new_html[:-5]
+            # ([-/第页0-9]|<div>|</div>)*
+            new_html = re.sub('</table>((<div>[-/第页0-9]*</div>#@#@#)|(#@#@#<div>[^<]*</div>)|#@#@#)<table border="1">',
+                              '<tr><td>#@#@#</td></tr>',
                               new_html)
+            print('new_html', new_html)
 
             soup = BeautifulSoup(new_html, 'lxml')
             trs = soup.findAll('tr')
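
The final hunk does the actual stitching. Joining the group's pages with the #@#@# sentinel guarantees that the rewrite pattern only fires at genuine page boundaries, never at a </table>...<table> pair that happens to occur within a single page. A minimal end-to-end sketch with two toy pages; the method afterwards re-parses the result (the soup/trs lines above) to post-process the marker rows:

    import re

    page1 = '<table border="1"><tr><td>a</td><td>b</td></tr></table><div>第1页</div>'
    page2 = '<table border="1"><tr><td>c</td><td>d</td></tr></table>'

    # Join with the sentinel, exactly as the loop above does.
    merged = '#@#@#'.join([page1, page2])

    # Same pattern as the commit: a closing tag, an optional page-number or
    # header <div> on either side of the sentinel, then the next opening tag,
    # all collapsed into a single marker row -- two tables become one.
    merged = re.sub(
        '</table>((<div>[-/第页0-9]*</div>#@#@#)|(#@#@#<div>[^<]*</div>)|#@#@#)<table border="1">',
        '<tr><td>#@#@#</td></tr>',
        merged)

    print(merged)
    # one table, with <tr><td>#@#@#</td></tr> marking the former page break:
    # <table border="1"><tr><td>a</td><td>b</td></tr><tr><td>#@#@#</td></tr><tr><td>c</td><td>d</td></tr></table>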