Przeglądaj źródła

Merge branch 'master' of http://192.168.2.103:3000/fangjiasheng/FORMAT_CONVERSION_MAXCOMPUTE

znj 1 rok temu
rodzic
commit
932a19a6e1
2 zmienionych plików z 171 dodań i 53 usunięć
  1. 92 8
      format_convert/convert_pdf.py
  2. 79 45
      format_convert/utils.py

+ 92 - 8
format_convert/convert_pdf.py

@@ -426,7 +426,7 @@ class PDFConvert:
 
     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
         list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
-                                                                                    from_pdf=True, is_reverse=True)
+                                                                                    from_pdf=True, is_reverse=False)
         self._page.in_table_objs = filter_objs
 
         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
@@ -444,7 +444,7 @@ class PDFConvert:
             _sen = _Sentence(sentence.text, sentence.bbox)
             self._page.add_child(_sen)
         # pdf对象需反向排序
-        self._page.is_reverse = True
+        # self._page.is_reverse = True
 
         return list_tables
 
@@ -577,6 +577,23 @@ class PDFConvert:
             self._page.error_code = layout
             return
 
+        # 翻转pdf中所有对象的y坐标
+        max_y, min_y = 0, 10000
+        for x in layout:
+            min_y = min(min_y, x.y0, x.y1)
+            max_y = max(max_y, x.y0, x.y1)
+        if max_y == 0:
+            return
+        for x in layout:
+            # 外层obj的bbox设置
+            x.set_bbox((x.x0, round(max_y - max(x.y0, x.y1), 1), x.x1, round(max_y - min(x.y0, x.y1), 1)))
+            # 内层单个字符的bbox设置
+            if isinstance(x, (LTTextBoxHorizontal, LTTextBoxVertical)):
+                for lt_text_line in x:
+                    for lt_char in lt_text_line:
+                        if isinstance(lt_char, LTChar):
+                            lt_char.set_bbox((lt_char.x0, round(max_y - max(lt_char.y0, lt_char.y1), 1), lt_char.x1, round(max_y - min(lt_char.y0, lt_char.y1), 1)))
+
         # 判断该页的对象类型,并存储
         lt_text_list = []
         lt_image_list = []
@@ -675,7 +692,8 @@ class PDFConvert:
                     traceback.print_exc()
 
             # pdf对象需反向排序
-            self._page.is_reverse = True
+            # self._page.is_reverse = True
+
             if self.has_init_pdf[3] == 0:
                 self.init_package("pdfplumber")
 
@@ -699,7 +717,7 @@ class PDFConvert:
                 else:
                     _image = _Image(page_image[1], page_image[0])
                     _image.is_from_pdf = True
-                    _image.is_reverse = True
+                    # _image.is_reverse = True
                     _image.b_table_from_text = True
                     _image.b_table_text_obj_list = lt_text_list
                     _image.b_table_layout_size = (layout.width, layout.height)
@@ -794,7 +812,7 @@ class PDFConvert:
             self._doc.add_child(self._page)
         log('get_all_page_image cost: ' + str(time.time()-start_time))
 
-    def connect_table(self, html_list):
+    def connect_table(self, html_list, show=0):
         if not html_list:
             return html_list
 
@@ -807,6 +825,7 @@ class PDFConvert:
         # 1.5: A后有文字(除了页码还有页眉),且A的后面只有一行且中文不超过15个字
         connect_flag_list = []
         soup_list = []
+        connect_rule_dict = {}
         for i, h in enumerate(html_list):
             soup = BeautifulSoup(h, 'lxml')
             soup_list.append(soup)
@@ -819,6 +838,10 @@ class PDFConvert:
                 match = re.finditer('</table>', h[last_table_start:])
                 for m in match:
                     last_table_end = m.span()[1] + last_table_start
+
+            # 补充规则,把表格也带上
+            rule_a = [0, h[last_table_start:last_table_end]]
+
             # 最后一个表格后有无除了页码外的内容
             connect_flag1 = False
             if last_table_end is not None:
@@ -840,6 +863,14 @@ class PDFConvert:
             for m in match:
                 first_table_start = m.span()[0]
                 break
+            if first_table_start is not None:
+                match = re.finditer('</table>', h[first_table_start:])
+                for m in match:
+                    first_table_end = m.span()[1] + first_table_start
+
+            # 补充规则,把表格也带上
+            rule_b = [0, h[first_table_start:first_table_end]]
+
             # 第一个表格前有无内容
             connect_flag2 = False
             if first_table_start is not None and first_table_start == 0:
@@ -858,6 +889,7 @@ class PDFConvert:
                         # 表格前内容长度不超过60,第一个为空,且最大文字长度不小于30
                         if not connect_flag2 and len(h[:first_table_start]) <= 60 and col_text_len_list[0] == 0 and max(col_text_len_list) >= 30:
                             connect_flag2 = True
+                            rule_b[0] = 1
                         # 有文字格子数占一半以下且第一个格子为空
                         if not connect_flag2 and col_text_len_list.count(0) >= len(col_text_len_list) / 2 and col_text_len_list[0] == 0:
                             connect_flag2 = True
@@ -870,8 +902,11 @@ class PDFConvert:
                         #     connect_flag2 = True
 
             connect_flag_list.append([i, connect_flag2, connect_flag1])
+            connect_rule_dict[i] = [rule_b, rule_a]
 
-        print('connect_flag_list', connect_flag_list)
+        if show:
+            print('connect_flag_list', connect_flag_list)
+            print('connect_rule_dict', connect_rule_dict)
 
         # 根据条件1合并需连接页码,形成组
         connect_pages_list = []
@@ -889,7 +924,8 @@ class PDFConvert:
             if temp_list:
                 connect_pages_list.append(temp_list)
 
-        print('connect_pages_list', connect_pages_list)
+        if show:
+            print('connect_pages_list', connect_pages_list)
 
         # 判断后续条件:判断组内列数是否相同
         connect_pages_list2 = []
@@ -942,7 +978,55 @@ class PDFConvert:
                 if new_c_list:
                     connect_pages_list2.append(new_c_list)
 
-        print('connect_pages_list2', connect_pages_list2)
+        if show:
+            print('connect_pages_list2', connect_pages_list2)
+
+        # 判断连接的两个表格是否需要补单元格内容
+        for c_list in connect_pages_list2:
+            for i in range(len(c_list)-1):
+                page_index1 = c_list[i][0]
+                page_index2 = c_list[i+1][0]
+                html2 = html_list[page_index2]
+                soup2 = soup_list[page_index2]
+                rule1 = connect_rule_dict.get(page_index1)[1]
+                rule2 = connect_rule_dict.get(page_index2)[0]
+                # print('rule1', rule1)
+                # if rule2[0]:
+                table1 = BeautifulSoup(rule1[1], 'lxml').findAll('table')[0]
+                table2 = BeautifulSoup(rule2[1], 'lxml').findAll('table')[0]
+                add_td_value = []
+                # 获取最后一行td
+                for tr in table1.findAll('tr')[::-1]:
+                    temp_list = []
+                    for td in tr.findAll('td'):
+                        temp_list.append(td.get_text())
+                    add_td_value = temp_list
+                    break
+                # print('add_td_value', add_td_value)
+                tr_index = 0
+                for tr in table2.findAll('tr'):
+                    temp_list = []
+                    for td in tr.findAll('td'):
+                        if len(td.get_text()) < 1:
+                            temp_list.append(0)
+                        else:
+                            temp_list.append(1)
+                    # print('temp_list', temp_list)
+                    if temp_list and add_td_value and len(temp_list) == len(add_td_value) \
+                            and 1 in temp_list and temp_list[0] != 1 \
+                            and 1 not in temp_list[:temp_list.index(1)]:
+                        for j in range(len(temp_list)):
+                            if temp_list[j] == 0:
+                                tr.findAll('td')[j].string = add_td_value[j]
+                            # else:
+                            #     # 只有第一行,且列数大于3,且只有一列有值情况下,上下两行文本合并
+                            #     if tr_index == 0 and len(temp_list) >= 3 and temp_list.count(1) == 1:
+                            #         tr.findAll('td')[j].string += add_td_value[j]
+                        # print('tr.findAll(td)[0]', tr.findAll('td')[0])
+                    tr_index += 1
+
+                soup2.findAll('table')[0].replace_with(table2)
+                html_list[page_index2] = str(soup2)
 
         # 符合连接条件的拼接表格
         new_html_list = []

+ 79 - 45
format_convert/utils.py

@@ -355,7 +355,7 @@ def slash_replace(_str, reverse=False):
 
 
 class LineTable:
-    def recognize_table(self, list_textbox, list_line, sourceP_LB=True,
+    def recognize_table(self, list_textbox, list_line, sourceP_LB=False,
                         splited=False, from_pdf=False, is_reverse=False, show=0):
         self.list_line = list_line
         self.list_crosspoints = self.recognize_crosspoints(list_line)
@@ -938,24 +938,24 @@ class LineTable:
             if len(_line) > 0:
                 _bbox = _line[0].get("bbox")
                 # check if has lap
-                if (min(_bbox[1], _bbox[3]) > max(bbox[1], bbox[3]) or max(_bbox[1], _bbox[3]) < min(bbox[1], bbox[3])):
+                if min(_bbox[1], _bbox[3]) > max(bbox[1], bbox[3]) or max(_bbox[1], _bbox[3]) < min(bbox[1], bbox[3]):
                     # if abs(min(_bbox[1],_bbox[3])-min(bbox[1],bbox[3]))>margin or abs(max(_bbox[1],_bbox[3])-max(bbox[1],bbox[3]))>margin:
                     #     print(_bbox)
                     #     print(bbox)
-                    # print("check position y false")
+                    print("check position y false", _bbox, bbox)
                     return False
             # check x
             if _position <= len(_line) - 1:
                 after_bbox = _line[_position].get("bbox")
                 # the inserted bbox.x1 must not exceed the following bbox.x0
                 if not (after_bbox[0] >= bbox[2]):
-                    # print("check position x after false")
+                    # print("check position x after false 1")
                     return False
-            if _position - 1 > 0 and _position - 1 < len(_line):
+            if 0 < _position - 1 < len(_line):
                 before_bbox = _line[_position - 1].get("bbox")
                 # the inserted bbox.x0 must not be less than the preceding bbox.x1
                 if not (bbox[0] >= before_bbox[2]):
-                    # print("check position x before false")
+                    # print("check position x before false 2")
                     return False
             return True
 
@@ -994,22 +994,40 @@ class LineTable:
             while c_i < len(_line):
                 _cell = _line[c_i]
                 if _cell.get("rowspan") > 1:
+                    # print('_cell', _cell)
                     x0, y0, x1, y1 = _cell.get("bbox")
                     _rospan = _cell.get("rowspan")
                     locations = self.getSpanLocation(list_y, y0, y1, 10)
+                    # print('locations', locations)
 
                     if len(locations) == _rospan + 1:
-                        _cell["bbox"] = (x0, y0, x1, locations[1])
+                        if self.is_reverse:
+                            _cell["bbox"] = (x0, locations[-2], x1, y0)
+                        else:
+                            _cell["bbox"] = (x0, y0, x1, locations[1])
                         _cell["rowspan"] = 1
 
+                        # print('_cell1', _cell)
+
                         for i in range(1, _rospan):
                             n_cell = {}
                             n_cell.update(_cell)
+                            # if not self.is_reverse:
                             if l_i + i <= len(_table) - 1:
                                 # print(len(_table),l_i+i)
                                 n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
+                                # print('n_cell', n_cell)
                                 if checkPosition(_table[l_i + i], c_i, n_cell["bbox"]):
+                                    # print('n_cell1', n_cell)
                                     _table[l_i + i].insert(c_i, n_cell)
+                            # else:
+                            #     if l_i - i >= 0:
+                            #         # print(len(_table),l_i+i)
+                            #         n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
+                            #         print('n_cell', n_cell)
+                            #         if checkPosition(_table[l_i - i], c_i, n_cell["bbox"]):
+                            #             print('n_cell1', n_cell)
+                            #             _table[l_i - i].insert(c_i, n_cell)
                 c_i += 1
 
     def fixRect(self, _table, list_x, list_y, sourceP_LB, margin):
@@ -1019,12 +1037,16 @@ class LineTable:
         #         _cell = _table[line_i][cell_i]
         #         print(line_i,cell_i,_cell["bbox"],_cell["text"])
         for _line in _table:
+            _line.sort(key=lambda x: x.get('bbox')[0])
+            # print('_line', _line)
             extend_line = []
             for c_i in range(len(_line)):
                 c_cell = _line[c_i]
 
                 # first cell missing
                 if c_i == 0 and c_cell["bbox"][0] != list_x[0]:
+                    # print('c_cell', c_cell)
+                    # print('list_x', list_x)
                     _bbox = (list_x[0], c_cell["bbox"][1], c_cell["bbox"][0], c_cell["bbox"][3])
                     _cell = {"bbox": _bbox,
                              "rect": LTRect(1, _bbox),
@@ -1103,7 +1125,8 @@ class LineTable:
             # 分行,根据y重合
             all_match_box_list = []
 
-            inbox_textbox_list.sort(key=lambda x: x.bbox[1], reverse=sourceP_LB)
+            # inbox_textbox_list.sort(key=lambda x: x.bbox[1], reverse=sourceP_LB)
+            inbox_textbox_list.sort(key=lambda x: x.bbox[1])
             for i in range(len(inbox_textbox_list)):
                 match_box_list = []
                 box1 = inbox_textbox_list[i]
@@ -1132,7 +1155,8 @@ class LineTable:
                 all_match_box_list.append(match_box_list)
 
             # print("match_box_list", all_match_box_list)
-            all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0), reverse=sourceP_LB)
+            # all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0), reverse=sourceP_LB)
+            all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0))
             for box_list in all_match_box_list:
                 for box in box_list:
                     _cell["text"] += re.sub("\s", '', box[0])
@@ -1148,32 +1172,32 @@ class LineTable:
 
         clusters_rects = []
         # 根据y1聚类
-        if sourceP_LB:
-            list_rect.sort(key=lambda x: x.bbox[3])
-            for _rect in list_rect:
-                _y0 = _rect.bbox[3]
-                _y1 = _rect.bbox[1]
-                _find = False
-                for l_cr in clusters_rects:
-                    if abs(l_cr[0].bbox[3] - _y0) < margin:
-                        _find = True
-                        l_cr.append(_rect)
-                        break
-                if not _find:
-                    clusters_rects.append([_rect])
-        else:
-            list_rect.sort(key=lambda x: x.bbox[1])
-            for _rect in list_rect:
-                _y0 = _rect.bbox[1]
-                _y1 = _rect.bbox[3]
-                _find = False
-                for l_cr in clusters_rects:
-                    if abs(l_cr[0].bbox[1] - _y0) < margin:
-                        _find = True
-                        l_cr.append(_rect)
-                        break
-                if not _find:
-                    clusters_rects.append([_rect])
+        # if sourceP_LB:
+        #     list_rect.sort(key=lambda x: x.bbox[3])
+        #     for _rect in list_rect:
+        #         _y0 = _rect.bbox[3]
+        #         _y1 = _rect.bbox[1]
+        #         _find = False
+        #         for l_cr in clusters_rects:
+        #             if abs(l_cr[0].bbox[3] - _y0) < margin:
+        #                 _find = True
+        #                 l_cr.append(_rect)
+        #                 break
+        #         if not _find:
+        #             clusters_rects.append([_rect])
+        # else:
+        list_rect.sort(key=lambda x: x.bbox[1])
+        for _rect in list_rect:
+            _y0 = _rect.bbox[1]
+            _y1 = _rect.bbox[3]
+            _find = False
+            for l_cr in clusters_rects:
+                if abs(l_cr[0].bbox[1] - _y0) < margin:
+                    _find = True
+                    l_cr.append(_rect)
+                    break
+            if not _find:
+                clusters_rects.append([_rect])
 
         # print("textbox:===================")
         # for _textbox in list_textbox:
@@ -1199,11 +1223,12 @@ class LineTable:
         list_y = list(set_y)
 
         list_x.sort(key=lambda x: x)
-        list_y.sort(key=lambda x: x, reverse=sourceP_LB)
+        # list_y.sort(key=lambda x: x, reverse=sourceP_LB)
+        list_y.sort(key=lambda x: x)
 
         # print("clusters_rects", len(clusters_rects))
-        if sourceP_LB:
-            clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
+        # if sourceP_LB:
+        #     clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
         clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
 
         for l_cr in clusters_rects:
@@ -1249,7 +1274,7 @@ class LineTable:
             _table.append(table_line)
         return _table, list_x, list_y
 
-    def rect2table(self, list_textbox, list_rect, in_objs, margin=5, sourceP_LB=True):
+    def rect2table(self, list_textbox, list_rect, in_objs, margin=5, sourceP_LB=False):
 
         def getIOU(bbox0, bbox1):
             width = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0]) - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0])
@@ -1265,8 +1290,9 @@ class LineTable:
             # 打印_table
             temp_list = []
             for t in _table:
+                print('------ makeTableByRect row ------')
                 for c in t:
-                    print(c)
+                    print('makeTableByRect col', c)
                     temp_list.append(c)
             self._plot([], [], temp_list, title='makeTableByRect table')
 
@@ -1274,10 +1300,10 @@ class LineTable:
             return
 
         # pdf纯文本上下颠倒,pdf图片不颠倒
-        if self.is_reverse:
-            _table.sort(key=lambda x: (-x[0].get('bbox')[1], -x[0].get('bbox')[3]))
-        else:
-            _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
+        # if self.is_reverse:
+        #     _table.sort(key=lambda x: (-x[0].get('bbox')[1], -x[0].get('bbox')[3]))
+        # else:
+        _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
 
         self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
 
@@ -1297,12 +1323,20 @@ class LineTable:
 
         self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
 
+        # pdf纯文本上下颠倒,pdf图片不颠倒
+        # if self.is_reverse:
+        #     _table.sort(key=lambda x: (-x[0].get('bbox')[1], -x[0].get('bbox')[3]))
+        # else:
+        _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
+
+
         if self.show:
             # 打印_table
             temp_list = []
             for t in _table:
+                print('------ fixRect row ------')
                 for c in t:
-                    print(c)
+                    print('fixRect col', c)
                     temp_list.append(c)
             self._plot([], [], temp_list, title='fixRect table')