2 lat temu · 932a19a6e1
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -426,7 +426,7 @@ class PDFConvert:
 
				 
			
 
				     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
			
 
				         list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
			
 
				-                                                                                    from_pdf=True, is_reverse=True)
			
 
				+                                                                                    from_pdf=True, is_reverse=False)
			
 
				         self._page.in_table_objs = filter_objs
			
 
				 
			
 
				         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
			
@@ -444,7 +444,7 @@ class PDFConvert:
 
				             _sen = _Sentence(sentence.text, sentence.bbox)
			
 
				             self._page.add_child(_sen)
			
 
				         # pdf对象需反向排序
			
 
				-        self._page.is_reverse = True
			
 
				+        # self._page.is_reverse = True
			
 
				 
			
 
				         return list_tables
			
 
				 
			
@@ -577,6 +577,23 @@ class PDFConvert:
 
				             self._page.error_code = layout
			
 
				             return
			
 
				 
			
 
				+        # 翻转pdf中所有对象的y坐标
			
 
				+        max_y, min_y = 0, 10000
			
 
				+        for x in layout:
			
 
				+            min_y = min(min_y, x.y0, x.y1)
			
 
				+            max_y = max(max_y, x.y0, x.y1)
			
 
				+        if max_y == 0:
			
 
				+            return
			
 
				+        for x in layout:
			
 
				+            # 外层obj的bbox设置
			
 
				+            x.set_bbox((x.x0, round(max_y - max(x.y0, x.y1), 1), x.x1, round(max_y - min(x.y0, x.y1), 1)))
			
 
				+            # 内层单个字符的bbox设置
			
 
				+            if isinstance(x, (LTTextBoxHorizontal, LTTextBoxVertical)):
			
 
				+                for lt_text_line in x:
			
 
				+                    for lt_char in lt_text_line:
			
 
				+                        if isinstance(lt_char, LTChar):
			
 
				+                            lt_char.set_bbox((lt_char.x0, round(max_y - max(lt_char.y0, lt_char.y1), 1), lt_char.x1, round(max_y - min(lt_char.y0, lt_char.y1), 1)))
			
 
				+
			
 
				         # 判断该页的对象类型，并存储
			
 
				         lt_text_list = []
			
 
				         lt_image_list = []
			
@@ -675,7 +692,8 @@ class PDFConvert:
 
				                     traceback.print_exc()
			
 
				 
			
 
				             # pdf对象需反向排序
			
 
				-            self._page.is_reverse = True
			
 
				+            # self._page.is_reverse = True
			
 
				+
			
 
				             if self.has_init_pdf[3] == 0:
			
 
				                 self.init_package("pdfplumber")
			
 
				 
			
@@ -699,7 +717,7 @@ class PDFConvert:
 
				                 else:
			
 
				                     _image = _Image(page_image[1], page_image[0])
			
 
				                     _image.is_from_pdf = True
			
 
				-                    _image.is_reverse = True
			
 
				+                    # _image.is_reverse = True
			
 
				                     _image.b_table_from_text = True
			
 
				                     _image.b_table_text_obj_list = lt_text_list
			
 
				                     _image.b_table_layout_size = (layout.width, layout.height)
			
@@ -794,7 +812,7 @@ class PDFConvert:
 
				             self._doc.add_child(self._page)
			
 
				         log('get_all_page_image cost: ' + str(time.time()-start_time))
			
 
				 
			
 
				-    def connect_table(self, html_list):
			
 
				+    def connect_table(self, html_list, show=0):
			
 
				         if not html_list:
			
 
				             return html_list
			
 
				 
			
@@ -807,6 +825,7 @@ class PDFConvert:
 
				         # 1.5: A后有文字(除了页码还有页眉)，且A的后面只有一行且中文不超过15个字
			
 
				         connect_flag_list = []
			
 
				         soup_list = []
			
 
				+        connect_rule_dict = {}
			
 
				         for i, h in enumerate(html_list):
			
 
				             soup = BeautifulSoup(h, 'lxml')
			
 
				             soup_list.append(soup)
			
@@ -819,6 +838,10 @@ class PDFConvert:
 
				                 match = re.finditer('</table>', h[last_table_start:])
			
 
				                 for m in match:
			
 
				                     last_table_end = m.span()[1] + last_table_start
			
 
				+
			
 
				+            # 补充规则，把表格也带上
			
 
				+            rule_a = [0, h[last_table_start:last_table_end]]
			
 
				+
			
 
				             # 最后一个表格后有无除了页码外的内容
			
 
				             connect_flag1 = False
			
 
				             if last_table_end is not None:
			
@@ -840,6 +863,14 @@ class PDFConvert:
 
				             for m in match:
			
 
				                 first_table_start = m.span()[0]
			
 
				                 break
			
 
				+            if first_table_start is not None:
			
 
				+                match = re.finditer('</table>', h[first_table_start:])
			
 
				+                for m in match:
			
 
				+                    first_table_end = m.span()[1] + first_table_start
			
 
				+
			
 
				+            # 补充规则，把表格也带上
			
 
				+            rule_b = [0, h[first_table_start:first_table_end]]
			
 
				+
			
 
				             # 第一个表格前有无内容
			
 
				             connect_flag2 = False
			
 
				             if first_table_start is not None and first_table_start == 0:
			
@@ -858,6 +889,7 @@ class PDFConvert:
 
				                         # 文字大于60且第一个为空
			
 
				                         if not connect_flag2 and len(h[:first_table_start]) <= 60 and col_text_len_list[0] == 0 and max(col_text_len_list) >= 30:
			
 
				                             connect_flag2 = True
			
 
				+                            rule_b[0] = 1
			
 
				                         # 有文字格子数占一半一下且第一个格子为空
			
 
				                         if not connect_flag2 and col_text_len_list.count(0) >= len(col_text_len_list) / 2 and col_text_len_list[0] == 0:
			
 
				                             connect_flag2 = True
			
@@ -870,8 +902,11 @@ class PDFConvert:
 
				                         #     connect_flag2 = True
			
 
				 
			
 
				             connect_flag_list.append([i, connect_flag2, connect_flag1])
			
 
				+            connect_rule_dict[i] = [rule_b, rule_a]
			
 
				 
			
 
				-        print('connect_flag_list', connect_flag_list)
			
 
				+        if show:
			
 
				+            print('connect_flag_list', connect_flag_list)
			
 
				+            print('connect_rule_dict', connect_rule_dict)
			
 
				 
			
 
				         # 根据条件1合并需连接页码，形成组
			
 
				         connect_pages_list = []
			
@@ -889,7 +924,8 @@ class PDFConvert:
 
				             if temp_list:
			
 
				                 connect_pages_list.append(temp_list)
			
 
				 
			
 
				-        print('connect_pages_list', connect_pages_list)
			
 
				+        if show:
			
 
				+            print('connect_pages_list', connect_pages_list)
			
 
				 
			
 
				         # 判断后续条件：判断组内列数是否相同
			
 
				         connect_pages_list2 = []
			
@@ -942,7 +978,55 @@ class PDFConvert:
 
				                 if new_c_list:
			
 
				                     connect_pages_list2.append(new_c_list)
			
 
				 
			
 
				-        print('connect_pages_list2', connect_pages_list2)
			
 
				+        if show:
			
 
				+            print('connect_pages_list2', connect_pages_list2)
			
 
				+
			
 
				+        # 判断连接的两个表格是否需要补单元格内容
			
 
				+        for c_list in connect_pages_list2:
			
 
				+            for i in range(len(c_list)-1):
			
 
				+                page_index1 = c_list[i][0]
			
 
				+                page_index2 = c_list[i+1][0]
			
 
				+                html2 = html_list[page_index2]
			
 
				+                soup2 = soup_list[page_index2]
			
 
				+                rule1 = connect_rule_dict.get(page_index1)[1]
			
 
				+                rule2 = connect_rule_dict.get(page_index2)[0]
			
 
				+                # print('rule1', rule1)
			
 
				+                # if rule2[0]:
			
 
				+                table1 = BeautifulSoup(rule1[1], 'lxml').findAll('table')[0]
			
 
				+                table2 = BeautifulSoup(rule2[1], 'lxml').findAll('table')[0]
			
 
				+                add_td_value = []
			
 
				+                # 获取最后一行td
			
 
				+                for tr in table1.findAll('tr')[::-1]:
			
 
				+                    temp_list = []
			
 
				+                    for td in tr.findAll('td'):
			
 
				+                        temp_list.append(td.get_text())
			
 
				+                    add_td_value = temp_list
			
 
				+                    break
			
 
				+                # print('add_td_value', add_td_value)
			
 
				+                tr_index = 0
			
 
				+                for tr in table2.findAll('tr'):
			
 
				+                    temp_list = []
			
 
				+                    for td in tr.findAll('td'):
			
 
				+                        if len(td.get_text()) < 1:
			
 
				+                            temp_list.append(0)
			
 
				+                        else:
			
 
				+                            temp_list.append(1)
			
 
				+                    # print('temp_list', temp_list)
			
 
				+                    if temp_list and add_td_value and len(temp_list) == len(add_td_value) \
			
 
				+                            and 1 in temp_list and temp_list[0] != 1 \
			
 
				+                            and 1 not in temp_list[:temp_list.index(1)]:
			
 
				+                        for j in range(len(temp_list)):
			
 
				+                            if temp_list[j] == 0:
			
 
				+                                tr.findAll('td')[j].string = add_td_value[j]
			
 
				+                            # else:
			
 
				+                            #     # 只有第一行，且列数大于3，且只有一列有值情况下，上下两行文本合并
			
 
				+                            #     if tr_index == 0 and len(temp_list) >= 3 and temp_list.count(1) == 1:
			
 
				+                            #         tr.findAll('td')[j].string += add_td_value[j]
			
 
				+                        # print('tr.findAll(td)[0]', tr.findAll('td')[0])
			
 
				+                    tr_index += 1
			
 
				+
			
 
				+                soup2.findAll('table')[0].replace_with(table2)
			
 
				+                html_list[page_index2] = str(soup2)
			
 
				 
			
 
				         # 符合连接条件的拼接表格
			
 
				         new_html_list = []
			
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
@@ -355,7 +355,7 @@ def slash_replace(_str, reverse=False):
 
				 
			
 
				 
			
 
				 class LineTable:
			
 
				-    def recognize_table(self, list_textbox, list_line, sourceP_LB=True,
			
 
				+    def recognize_table(self, list_textbox, list_line, sourceP_LB=False,
			
 
				                         splited=False, from_pdf=False, is_reverse=False, show=0):
			
 
				         self.list_line = list_line
			
 
				         self.list_crosspoints = self.recognize_crosspoints(list_line)
			
@@ -938,24 +938,24 @@ class LineTable:
 
				             if len(_line) > 0:
			
 
				                 _bbox = _line[0].get("bbox")
			
 
				                 # check if has lap
			
 
				-                if (min(_bbox[1], _bbox[3]) > max(bbox[1], bbox[3]) or max(_bbox[1], _bbox[3]) < min(bbox[1], bbox[3])):
			
 
				+                if min(_bbox[1], _bbox[3]) > max(bbox[1], bbox[3]) or max(_bbox[1], _bbox[3]) < min(bbox[1], bbox[3]):
			
 
				                     # if abs(min(_bbox[1],_bbox[3])-min(bbox[1],bbox[3]))>margin or abs(max(_bbox[1],_bbox[3])-max(bbox[1],bbox[3]))>margin:
			
 
				                     #     print(_bbox)
			
 
				                     #     print(bbox)
			
 
				-                    # print("check position y false")
			
 
				+                    print("check position y false", _bbox, bbox)
			
 
				                     return False
			
 
				             # check x
			
 
				             if _position <= len(_line) - 1:
			
 
				                 after_bbox = _line[_position].get("bbox")
			
 
				                 # the insert bbox.x1 should not less then the after bbox.x0
			
 
				                 if not (after_bbox[0] >= bbox[2]):
			
 
				-                    # print("check position x after false")
			
 
				+                    # print("check position x after false 1")
			
 
				                     return False
			
 
				-            if _position - 1 > 0 and _position - 1 < len(_line):
			
 
				+            if 0 < _position - 1 < len(_line):
			
 
				                 before_bbox = _line[_position - 1].get("bbox")
			
 
				                 # the insert bbox.x1 should less equal than the first bbox.x0
			
 
				                 if not (bbox[0] >= before_bbox[2]):
			
 
				-                    # print("check position x before false")
			
 
				+                    # print("check position x before false 2")
			
 
				                     return False
			
 
				             return True
			
 
				 
			
@@ -994,22 +994,40 @@ class LineTable:
 
				             while c_i < len(_line):
			
 
				                 _cell = _line[c_i]
			
 
				                 if _cell.get("rowspan") > 1:
			
 
				+                    # print('_cell', _cell)
			
 
				                     x0, y0, x1, y1 = _cell.get("bbox")
			
 
				                     _rospan = _cell.get("rowspan")
			
 
				                     locations = self.getSpanLocation(list_y, y0, y1, 10)
			
 
				+                    # print('locations', locations)
			
 
				 
			
 
				                     if len(locations) == _rospan + 1:
			
 
				-                        _cell["bbox"] = (x0, y0, x1, locations[1])
			
 
				+                        if self.is_reverse:
			
 
				+                            _cell["bbox"] = (x0, locations[-2], x1, y0)
			
 
				+                        else:
			
 
				+                            _cell["bbox"] = (x0, y0, x1, locations[1])
			
 
				                         _cell["rowspan"] = 1
			
 
				 
			
 
				+                        # print('_cell1', _cell)
			
 
				+
			
 
				                         for i in range(1, _rospan):
			
 
				                             n_cell = {}
			
 
				                             n_cell.update(_cell)
			
 
				+                            # if not self.is_reverse:
			
 
				                             if l_i + i <= len(_table) - 1:
			
 
				                                 # print(len(_table),l_i+i)
			
 
				                                 n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
			
 
				+                                # print('n_cell', n_cell)
			
 
				                                 if checkPosition(_table[l_i + i], c_i, n_cell["bbox"]):
			
 
				+                                    # print('n_cell1', n_cell)
			
 
				                                     _table[l_i + i].insert(c_i, n_cell)
			
 
				+                            # else:
			
 
				+                            #     if l_i - i >= 0:
			
 
				+                            #         # print(len(_table),l_i+i)
			
 
				+                            #         n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
			
 
				+                            #         print('n_cell', n_cell)
			
 
				+                            #         if checkPosition(_table[l_i - i], c_i, n_cell["bbox"]):
			
 
				+                            #             print('n_cell1', n_cell)
			
 
				+                            #             _table[l_i - i].insert(c_i, n_cell)
			
 
				                 c_i += 1
			
 
				 
			
 
				     def fixRect(self, _table, list_x, list_y, sourceP_LB, margin):
			
@@ -1019,12 +1037,16 @@ class LineTable:
 
				         #         _cell = _table[line_i][cell_i]
			
 
				         #         print(line_i,cell_i,_cell["bbox"],_cell["text"])
			
 
				         for _line in _table:
			
 
				+            _line.sort(key=lambda x: x.get('bbox')[0])
			
 
				+            # print('_line', _line)
			
 
				             extend_line = []
			
 
				             for c_i in range(len(_line)):
			
 
				                 c_cell = _line[c_i]
			
 
				 
			
 
				                 # first cell missing
			
 
				                 if c_i == 0 and c_cell["bbox"][0] != list_x[0]:
			
 
				+                    # print('c_cell', c_cell)
			
 
				+                    # print('list_x', list_x)
			
 
				                     _bbox = (list_x[0], c_cell["bbox"][1], c_cell["bbox"][0], c_cell["bbox"][3])
			
 
				                     _cell = {"bbox": _bbox,
			
 
				                              "rect": LTRect(1, _bbox),
			
@@ -1103,7 +1125,8 @@ class LineTable:
 
				             # 分行，根据y重合
			
 
				             all_match_box_list = []
			
 
				 
			
 
				-            inbox_textbox_list.sort(key=lambda x: x.bbox[1], reverse=sourceP_LB)
			
 
				+            # inbox_textbox_list.sort(key=lambda x: x.bbox[1], reverse=sourceP_LB)
			
 
				+            inbox_textbox_list.sort(key=lambda x: x.bbox[1])
			
 
				             for i in range(len(inbox_textbox_list)):
			
 
				                 match_box_list = []
			
 
				                 box1 = inbox_textbox_list[i]
			
@@ -1132,7 +1155,8 @@ class LineTable:
 
				                 all_match_box_list.append(match_box_list)
			
 
				 
			
 
				             # print("match_box_list", all_match_box_list)
			
 
				-            all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0), reverse=sourceP_LB)
			
 
				+            # all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0), reverse=sourceP_LB)
			
 
				+            all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0))
			
 
				             for box_list in all_match_box_list:
			
 
				                 for box in box_list:
			
 
				                     _cell["text"] += re.sub("\s", '', box[0])
			
@@ -1148,32 +1172,32 @@ class LineTable:
 
				 
			
 
				         clusters_rects = []
			
 
				         # 根据y1聚类
			
 
				-        if sourceP_LB:
			
 
				-            list_rect.sort(key=lambda x: x.bbox[3])
			
 
				-            for _rect in list_rect:
			
 
				-                _y0 = _rect.bbox[3]
			
 
				-                _y1 = _rect.bbox[1]
			
 
				-                _find = False
			
 
				-                for l_cr in clusters_rects:
			
 
				-                    if abs(l_cr[0].bbox[3] - _y0) < margin:
			
 
				-                        _find = True
			
 
				-                        l_cr.append(_rect)
			
 
				-                        break
			
 
				-                if not _find:
			
 
				-                    clusters_rects.append([_rect])
			
 
				-        else:
			
 
				-            list_rect.sort(key=lambda x: x.bbox[1])
			
 
				-            for _rect in list_rect:
			
 
				-                _y0 = _rect.bbox[1]
			
 
				-                _y1 = _rect.bbox[3]
			
 
				-                _find = False
			
 
				-                for l_cr in clusters_rects:
			
 
				-                    if abs(l_cr[0].bbox[1] - _y0) < margin:
			
 
				-                        _find = True
			
 
				-                        l_cr.append(_rect)
			
 
				-                        break
			
 
				-                if not _find:
			
 
				-                    clusters_rects.append([_rect])
			
 
				+        # if sourceP_LB:
			
 
				+        #     list_rect.sort(key=lambda x: x.bbox[3])
			
 
				+        #     for _rect in list_rect:
			
 
				+        #         _y0 = _rect.bbox[3]
			
 
				+        #         _y1 = _rect.bbox[1]
			
 
				+        #         _find = False
			
 
				+        #         for l_cr in clusters_rects:
			
 
				+        #             if abs(l_cr[0].bbox[3] - _y0) < margin:
			
 
				+        #                 _find = True
			
 
				+        #                 l_cr.append(_rect)
			
 
				+        #                 break
			
 
				+        #         if not _find:
			
 
				+        #             clusters_rects.append([_rect])
			
 
				+        # else:
			
 
				+        list_rect.sort(key=lambda x: x.bbox[1])
			
 
				+        for _rect in list_rect:
			
 
				+            _y0 = _rect.bbox[1]
			
 
				+            _y1 = _rect.bbox[3]
			
 
				+            _find = False
			
 
				+            for l_cr in clusters_rects:
			
 
				+                if abs(l_cr[0].bbox[1] - _y0) < margin:
			
 
				+                    _find = True
			
 
				+                    l_cr.append(_rect)
			
 
				+                    break
			
 
				+            if not _find:
			
 
				+                clusters_rects.append([_rect])
			
 
				 
			
 
				         # print("textbox:===================")
			
 
				         # for _textbox in list_textbox:
			
@@ -1199,11 +1223,12 @@ class LineTable:
 
				         list_y = list(set_y)
			
 
				 
			
 
				         list_x.sort(key=lambda x: x)
			
 
				-        list_y.sort(key=lambda x: x, reverse=sourceP_LB)
			
 
				+        # list_y.sort(key=lambda x: x, reverse=sourceP_LB)
			
 
				+        list_y.sort(key=lambda x: x)
			
 
				 
			
 
				         # print("clusters_rects", len(clusters_rects))
			
 
				-        if sourceP_LB:
			
 
				-            clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
			
 
				+        # if sourceP_LB:
			
 
				+        #     clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
			
 
				         clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
			
 
				 
			
 
				         for l_cr in clusters_rects:
			
@@ -1249,7 +1274,7 @@ class LineTable:
 
				             _table.append(table_line)
			
 
				         return _table, list_x, list_y
			
 
				 
			
 
				-    def rect2table(self, list_textbox, list_rect, in_objs, margin=5, sourceP_LB=True):
			
 
				+    def rect2table(self, list_textbox, list_rect, in_objs, margin=5, sourceP_LB=False):
			
 
				 
			
 
				         def getIOU(bbox0, bbox1):
			
 
				             width = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0]) - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0])
			
@@ -1265,8 +1290,9 @@ class LineTable:
 
				             # 打印_table
			
 
				             temp_list = []
			
 
				             for t in _table:
			
 
				+                print('------ makeTableByRect row ------')
			
 
				                 for c in t:
			
 
				-                    print(c)
			
 
				+                    print('makeTableByRect col', c)
			
 
				                     temp_list.append(c)
			
 
				             self._plot([], [], temp_list, title='makeTableByRect table')
			
 
				 
			
@@ -1274,10 +1300,10 @@ class LineTable:
 
				             return
			
 
				 
			
 
				         # pdf纯文本上下颠倒，pdf图片不颠倒
			
 
				-        if self.is_reverse:
			
 
				-            _table.sort(key=lambda x: (-x[0].get('bbox')[1], -x[0].get('bbox')[3]))
			
 
				-        else:
			
 
				-            _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
			
 
				+        # if self.is_reverse:
			
 
				+        #     _table.sort(key=lambda x: (-x[0].get('bbox')[1], -x[0].get('bbox')[3]))
			
 
				+        # else:
			
 
				+        _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
			
 
				 
			
 
				         self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
			
 
				 
			
@@ -1297,12 +1323,20 @@ class LineTable:
 
				 
			
 
				         self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
			
 
				 
			
 
				+        # pdf纯文本上下颠倒，pdf图片不颠倒
			
 
				+        # if self.is_reverse:
			
 
				+        #     _table.sort(key=lambda x: (-x[0].get('bbox')[1], -x[0].get('bbox')[3]))
			
 
				+        # else:
			
 
				+        _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
			
 
				+
			
 
				+
			
 
				         if self.show:
			
 
				             # 打印_table
			
 
				             temp_list = []
			
 
				             for t in _table:
			
 
				+                print('------ fixRect row ------')
			
 
				                 for c in t:
			
 
				-                    print(c)
			
 
				+                    print('fixRect col', c)
			
 
				                     temp_list.append(c)
			
 
				             self._plot([], [], temp_list, title='fixRect table')