Ver Fonte

增加特定页提取

fangjiasheng há 1 ano atrás
pai
commit
39c37a40ed

+ 136 - 93
format_convert/convert_image.py

@@ -17,7 +17,7 @@ import traceback
 import cv2
 from isr.pre_process import count_red_pixel
 from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html, get_logger, log, \
-    memory_decorator, pil_resize, np2bytes, ocr_cant_read, get_garble_code2, line_iou
+    memory_decorator, pil_resize, np2bytes, ocr_cant_read, get_garble_code2, line_iou, image_rotate
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
     from_idc_interface, from_isr_interface
 from format_convert.table_correct import get_rotated_image
@@ -25,7 +25,7 @@ from botr.extract_table import get_table
 
 
 def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
-                  b_table_from_text=False, pdf_obj_list=[], pdf_layout_size=()):
+                  b_table_from_text=False, pdf_obj_list=[], pdf_layout_size=(), is_reverse=False):
     from format_convert.convert_tree import _Table, _Sentence
 
     def get_cluster(t_list, b_list, axis):
@@ -88,6 +88,15 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                 textbox_list.remove(_obj)
         return textbox_list
 
+    def resize_process(_image_np):
+        """Cap the overall resolution of an image before further processing.
+
+        If either of the first two dimensions of ``_image_np`` exceeds 2048,
+        a target size is obtained from ``get_best_predict_size2`` and the image
+        is resized with ``pil_resize``; otherwise the input is returned as is.
+        """
+        max_side = 2048
+        # Nothing to do when both dimensions are already within the limit.
+        if max(_image_np.shape[0], _image_np.shape[1]) <= max_side:
+            return _image_np
+
+        new_h, new_w = get_best_predict_size2(_image_np, threshold=max_side)
+        log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(new_h) + "," + str(new_w))
+        return pil_resize(_image_np, new_h, new_w)
+
     def idc_process(_image_np, return_angle=False):
         # 图片倾斜校正,写入原来的图片路径
         # print("image_process", image_path)
@@ -128,8 +137,10 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                 else:
                     return angle
         # 根据角度旋转
-        _image_pil = Image.fromarray(_image_np)
-        _image_np = np.array(_image_pil.rotate(angle, expand=1))
+        # _image_pil = Image.fromarray(_image_np)
+        # _image_np = np.array(_image_pil.rotate(angle, expand=1))
+        _image_np = image_rotate(_image_np, angle)
+
         # 写入
         # idc_path = image_path.split(".")[0] + "_idc." + image_path.split(".")[-1]
         # cv2.imwrite(idc_path, image_np)
@@ -341,14 +352,16 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                 lt = LineTable()
                 tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
                                                                                    sourceP_LB=False, splited=False,
-                                                                                   from_pdf=is_from_pdf)
+                                                                                   from_pdf=is_from_pdf,
+                                                                                   is_reverse=is_reverse)
                 # 需分割textbox
                 if connect_textbox_list:
                     list_text_boxes = table_textbox_split(_image_np, connect_textbox_list, list_text_boxes)
                     # 新的textbox,重新做表格
                     tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
                                                                                        sourceP_LB=False, splited=True,
-                                                                                       from_pdf=is_from_pdf)
+                                                                                       from_pdf=is_from_pdf,
+                                                                                       is_reverse=is_reverse)
 
                 if not tables:
                     return list_text_boxes, tables, obj_in_table
@@ -359,6 +372,42 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             traceback.print_exc()
             return [-8], [-8], [-8]
 
+    def slice_process(_image_np):
+        """Split a long image into slices and give all slices one orientation.
+
+        When ``need_image_slice`` says the image should be cut, it is split
+        with ``image_slice_new``, ``idc_process`` is asked for an angle per
+        slice, one angle is chosen for the whole image and every slice is
+        rotated by it. Slice order is reversed when the chosen angle is 180.
+
+        Returns a list of image arrays. The list is ``[_image_np]`` when no
+        slicing is needed or when slicing produced nothing.
+        """
+        # Decide from the argument itself, not from the enclosing image_np.
+        slice_flag = need_image_slice(_image_np)
+        log("need_image_slice " + str(slice_flag) + " " + str(_image_np.shape))
+        _image_np_list = [_image_np]
+        if slice_flag:
+            # Cut the long image into slices
+            _image_np_list = image_slice_new(_image_np)
+
+        # Only vote on an angle when slicing actually produced slices;
+        # otherwise fall through to the "failed" fallback below.
+        if slice_flag and _image_np_list:
+            # Count how many slices vote for each angle (0 and 360 are the same)
+            # NOTE(review): assumes idc_process returns an (image, angle) pair
+            # here - confirm its behaviour on interface errors.
+            angle_dict = {}
+            for im in _image_np_list:
+                _, angle = idc_process(im, return_angle=True)
+                if angle in [0, 360]:
+                    angle = 0
+                angle_dict[angle] = angle_dict.get(angle, 0) + 1
+
+            # idc is not very accurate: if any slice votes 0, use 0 directly
+            if 0 in angle_dict:
+                log('image_slice 0 in angle_dict')
+                angle = 0
+            else:
+                # Otherwise take the angle with the most votes
+                angle_list = [[key, value] for key, value in angle_dict.items()]
+                angle_list.sort(key=lambda x: x[1])
+                log('image_slice angle_list ' + str(angle_list))
+                angle = angle_list[-1][0]
+            _image_np_list = [image_rotate(im, angle) for im in _image_np_list]
+            # An upside-down image also has its slices in reverse reading order
+            if angle == 180:
+                _image_np_list.reverse()
+
+        if len(_image_np_list) < 1:
+            log("image_slice failed!")
+            _image_np_list = [_image_np]
+        return _image_np_list
+
     def get_text_box_obj(_text_list, _bbox_list):
         from format_convert.convert_tree import TextBox
         _text_box_list = []
@@ -489,36 +538,16 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
         if not b_table_from_text:
             # 判断是否需要长图分割
-            slice_flag = need_image_slice(image_np)
-            log("need_image_slice " + str(slice_flag) + " " + str(image_np.shape))
             idc_flag = False
-            image_np_list = [image_np]
-            if slice_flag:
-                # 方向分类
-                image_np = idc_process(image_np)
+            image_np_list = slice_process(image_np)
+            if len(image_np_list) > 1:
                 idc_flag = True
-                if isinstance(image_np, list):
-                    return image_np
-
-                # 再判断
-                if need_image_slice(image_np):
-                    # 长图分割
-                    image_np_list = image_slice_new(image_np)
-            if len(image_np_list) < 1:
-                log("image_slice failed!")
-                image_np_list = [image_np]
-                # return [-10]
 
-            all_obj_list = []
-            _add_y = 0
+            reverse_flag = 0
+            table_textbox_list = []
             for image_np in image_np_list:
-                # print("sub image shape", image_np.shape)
                 # 整体分辨率限制
-                threshold = 2048
-                if image_np.shape[0] > threshold or image_np.shape[1] > threshold:
-                    h, w = get_best_predict_size2(image_np, threshold=threshold)
-                    log("global image resize " + str(image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
-                    image_np = pil_resize(image_np, h, w)
+                image_np = resize_process(image_np)
 
                 # 印章去除
                 image_np = isr_process(image_np)
@@ -527,21 +556,23 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
                 # 文字识别
                 text_list, box_list = ocr_process(image_np)
-                # print('text_list', text_list)
                 if judge_error_code(text_list):
                     return text_list
 
                 # 判断ocr识别是否正确
+                print('ocr_cant_read(text_list, box_list)', ocr_cant_read(text_list, box_list), idc_flag)
                 if ocr_cant_read(text_list, box_list) and not idc_flag:
-                # if True:
                     # 方向分类
                     image_np, angle = idc_process(image_np, return_angle=True)
                     if isinstance(image_np, list):
                         return image_np
                     # 如果角度不变,旋转180
                     if angle in [0, 360]:
-                        image_pil = Image.fromarray(image_np)
-                        image_np = np.array(image_pil.rotate(180, expand=1))
+                        print('ocr_cant_read image_rotate 180')
+                        # image_np = image_rotate(image_np, angle=180)
+                        reverse_flag = 1
+                        # image_pil = Image.fromarray(image_np)
+                        # image_np = np.array(image_pil.rotate(180, expand=1))
                     # cv2.imshow("idc_process", image_np)
                     # cv2.waitKey(0)
 
@@ -550,10 +581,6 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                     if judge_error_code(text_list1):
                         return text_list1
 
-                    # all_text = ''.join(text_list1)
-                    # all_text = re.sub('[\s\d]', '', all_text)
-                    # if len(re.findall(get_garble_code2(), all_text)) >= 2:
-                    # log('text_list1' + ''.join(text_list1))
                     if len(text_list1) > 0 and ocr_cant_read(text_list1, box_list_1) and is_from_pdf:
                         return [-16]
 
@@ -598,17 +625,37 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                 # 合并非表格的同一行TextBox
                 text_box_list = merge_textbox(text_box_list, obj_in_table_list)
 
-                # 对象生成
+                table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
+
+            if reverse_flag:
+                table_textbox_list.reverse()
+
+                for i in range(len(image_np_list)):
+                    image_np_list[i] = image_rotate(image_np_list[i], angle=180)
+                image_np_list.reverse()
+
+            # index = 0
+            # for image_np in image_np_list:
+            #     cv2.imshow(str(index) + '.jpg', image_np)
+            #     cv2.waitKey(0)
+            #     index += 1
+
+            # 对象生成
+            all_obj_list = []
+            _add_y = 0
+            for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
                 obj_list = []
                 for table in table_list:
-                    _table = _Table(table["table"], table["bbox"])
+                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+                    _table = _Table(table["table"], _table_bbox)
                     obj_list.append(_table)
                 for table in b_table_list:
-                    _table = _Table(table["table"], table["bbox"])
+                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+                    _table = _Table(table["table"], _table_bbox)
                     obj_list.append(_table)
-                    _table.y += 10000
                 for text_box in text_box_list:
                     if text_box not in obj_in_table_list:
+                        text_box.bbox[1] += _add_y
                         obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
 
                 # 多图修正y
@@ -618,11 +665,12 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                         obj.y += _add_y
                         list_y.append(obj.y)
                     if len(list_y) > 0:
-                        _add_y = max(list_y)
+                        _add_y += max(list_y)
 
                 # 合并
                 all_obj_list += obj_list
 
+        # 无边框表格图片
         else:
             all_obj_list = []
             table_list = []
@@ -799,57 +847,48 @@ def image_slice_new(image_np):
     # cv2.waitKey(0)
     # cv2.imwrite("error.jpg", dilation)
 
-    # 按行求平均
-    width_avg = np.average(np.float32(dilation), axis=1)
-    zero_index = np.where(width_avg == 0.)[0]
-    # print(height, width)
-    # print(width_avg)
-    # print(width_avg.shape)
-    # print(zero_index)
-    # print(zero_index.shape)
-    # zero_index.sort(key=lambda x: x)
-
-    # 截取范围内寻找分割点
-    max_distance = int(width / 2)
-    image_list = []
-    last_h = 0
-    for i in range(height // width + 1):
-        h = last_h + width
-
-        # 前后的分割点
-        zero_h_after = zero_index[np.where(zero_index >= h)]
-        zero_h_before = zero_index[np.where(zero_index <= h)]
-        # print("last_h, h", last_h, h)
-        # print("last_h, h", last_h, h)
-        # print(zero_index.shape)
-        # print("zero_h_after.shape", zero_h_after.shape)
-        if zero_h_after.shape[0] == 0:
-            # 最后一截
-            last_image = image_origin[last_h:, :, :]
-            if last_image.shape[0] <= max_distance:
-                image_list[-1] = np.concatenate([image_list[-1], last_image], axis=0)
-            else:
-                image_list.append(last_image)
-            break
-
-        # 分割点距离不能太远
-        cut_h = zero_h_after.tolist()[0]
-        # print("cut_h", cut_h)
-        if abs(h - cut_h) <= max_distance:
-            image_list.append(image_origin[last_h:cut_h, :, :])
-            last_h = cut_h
-        # 后面找不到往前找
+    # 预定义切割处
+    slice_time = height // (width)
+    slice_index_list = []
+    for i in range(slice_time):
+        if i < slice_time-1:
+            slice_index = width + i * width
         else:
-            cut_h = zero_h_before.tolist()[-1]
-            if abs(cut_h - h) <= max_distance:
-                image_list.append(image_origin[last_h:cut_h, :, :])
-                last_h = cut_h
+            slice_index = height
+        slice_index_list.append(slice_index)
+
+    # 在预定义切割处上下寻找合适的实际切割处
+    max_distance = int(width / 4)
+    real_slice_index_list = []
+    for i in range(len(slice_index_list)):
+        slice_index = slice_index_list[i]
+
+        if i == len(slice_index_list) - 1:
+            real_slice_index_list.append(int(slice_index))
+            continue
+
+        sub_dilation = dilation[slice_index-max_distance:slice_index+max_distance, :]
+        # 按行求平均
+        width_avg = np.average(np.float32(sub_dilation), axis=1)
+        # 取最小的
+        width_min_avg_index = np.argsort(width_avg, axis=0)[0]
+        # width_min_avg = width_avg[width_min_avg_index] + slice_index - max_distance
+        width_min_avg = width_min_avg_index + slice_index - max_distance
+        real_slice_index_list.append(int(width_min_avg))
+
+    # 切割
+    image_list = []
+    last_slice_index = 0
+    print('real_slice_index_list', real_slice_index_list)
+    for slice_index in real_slice_index_list:
+        image_list.append(image_origin[last_slice_index:slice_index, :, :])
+        last_slice_index = slice_index
 
     # i = 0
     # for im in image_list:
-    #     print(im.shape)
-    #     cv2.imwrite("error" + str(i) + ".jpg", im)
-    #     i += 1
+    #     # print(im.shape)
+    #     # cv2.imwrite("error" + str(i) + ".jpg", im)
+    #     # i += 1
     #     cv2.namedWindow("im", 0)
     #     cv2.resizeWindow("im", 1000, 800)
     #     cv2.imshow("im", im)
@@ -863,7 +902,7 @@ def need_image_slice(image_np):
     h, w = image_np.shape[:2]
     # if h > 3000 and w < 2000:
     #     return True
-    if 2. <= h / w and w >= 100:
+    if 3. <= h / w and w >= 100:
         return True
     return False
 
@@ -887,11 +926,15 @@ def remove_black_border(img_np):
         colflag = np.argwhere(colc > threshold)
 
         left, bottom, right, top = rowflag[0, 0], colflag[-1, 0], rowflag[-1, 0], colflag[0, 0]
-
+        if left == right or top == bottom:
+            raise
         # cv2.imshow('remove_black_border', img_np[left:right, top:bottom, :])
         # cv2.waitKey()
+        log('remove_black_border success')
         return img_np[left:right, top:bottom, :]
     except:
+        log('remove_black_border failed')
+        traceback.print_exc()
         return img_np
 
 

+ 102 - 10
format_convert/convert_pdf.py

@@ -112,6 +112,12 @@ class PDFConvert:
         # 记录图片对象的md5,用于去除大量重复图片
         self.md5_image_obj_list = []
 
+        # 记录该页是不是纯文本
+        self.only_text_list = []
+
+        # 是否提取特殊页
+        self.convert_specific_page = 1
+
     @memory_decorator
     def init_package(self, package_name):
         # 各个包初始化
@@ -177,6 +183,7 @@ class PDFConvert:
         pages = PDFPage.create_pages(self.doc_pdfminer)
         pages = list(pages)
         page_count = len(pages)
+        self.only_text_list = [-1] * len(pages)
         page_no = 0
         for page in pages:
             # 指定pdf页码
@@ -208,8 +215,67 @@ class PDFConvert:
             self._doc.add_child(self._page)
             page_no += 1
 
+        self._doc.children, detete_header_footer_list = self.delete_header_footer(self._doc.children)
+
+        if self.convert_specific_page and self.need_page_no is None:
+            # 补充提取特定页
+            # print('self.only_text_list', self.only_text_list)
+            if self.only_text_list.count(0) == 0:
+                ratio = 0
+            else:
+                ratio = self.only_text_list.count(0) / (page_count-self.only_text_list.count(-1))
+            # print('ratio', ratio)
+            if page_count > limit_page_cnt and ratio <= 0.2:
+                page_no = 0
+                find_flag = 0
+                add_page_list = []
+                for page in pages:
+                    if not int(limit_page_cnt/2) <= page_no < page_count - int(limit_page_cnt/2):
+                        page_no += 1
+                        continue
+
+                    # 解析单页
+                    start_time = time.time()
+                    self._page = _Page(page, page_no)
+                    self.convert_page(page, page_no, skip_image=1)
+                    log('convert_page add page_no: ' + str(page_no) + ' cost: ' + str(time.time()-start_time))
+
+                    # 删除页眉页脚
+                    pages, _ = self.delete_header_footer([self._page], detete_header_footer_list)
+                    self._page = pages[0]
+
+                    # 提取特殊部分
+                    re_str = '采购清单'
+                    # 坐标都是上下颠倒的,回正
+                    self._page.children.sort(key=lambda x: x.y, reverse=True)
+
+                    # print('find_flag', find_flag, type(self._page.children[-1]))
+                    if find_flag and type(self._page.children[0]) == _Table:
+                        add_page_list.append(self._page)
+                        if len(self._page.children) - 1 > 3:
+                            find_flag = 0
+
+                    for index in range(len(self._page.children)):
+                        obj = self._page.children[index]
+                        next_obj = None
+                        if index+1 < len(self._page.children):
+                            next_obj = self._page.children[index+1]
+                        # print('采购清单', type(obj) == _Sentence, re.search(re_str, str(obj.content)), str(obj.content)[:20])
+                        if type(obj) == _Sentence and re.search(re_str, obj.content) \
+                                and next_obj and type(next_obj) == _Table:
+                            add_page_list.append(self._page)
+                            # print('add_page_list', page_no)
+                            if len(self._page.children) - index - 1 > 3:
+                                find_flag = 0
+                            else:
+                                find_flag = 1
+                    page_no += 1
+
+                # print('add_page_list', add_page_list)
+                if add_page_list:
+                    self._doc.children = self._doc.children[:int(limit_page_cnt/2)] + add_page_list + self._doc.children[int(limit_page_cnt/2):]
+
         self.delete_same_image()
-        self.delete_header_footer()
         # self.delete_bold_text_duplicate()
 
     def delete_same_image(self, show=0):
@@ -245,9 +311,9 @@ class PDFConvert:
                         cv2.imshow('page img_np', img_np)
                         cv2.waitKey(0)
 
-    def delete_header_footer(self):
+    def delete_header_footer(self, pages, delete_list=[]):
         sen_dict = {}
-        for page in self._doc.children:
+        for page in pages:
             for obj in page.children:
                 if isinstance(obj, _Sentence):
                     key = str(obj.content) + ' ' + str(int(obj.y))
@@ -256,10 +322,20 @@ class PDFConvert:
                         sen_dict[key] += [obj]
                     else:
                         sen_dict[key] = [obj]
+
+        # 把需删除的加上
+        # print('delete_list', delete_list)
+        for key in delete_list:
+            if key in sen_dict:
+                sen_dict[key] = sen_dict.get(key) * 10
+
+        # print('sen_dict', sen_dict)
+        delete_footer_header_list = []
         for key in sen_dict.keys():
             l = sen_dict.get(key)
-            if len(l) >= 2/3 * max(10, len(self._doc.children)):
-                for page in self._doc.children:
+            if len(l) >= 1/3 * max(10, len(pages)):
+                delete_footer_header_list.append(key)
+                for page in pages:
                     new_children = []
                     for obj in page.children:
                         if isinstance(obj, _Sentence):
@@ -268,10 +344,10 @@ class PDFConvert:
                         else:
                             new_children.append(obj)
                     page.children = new_children
-                print('len(l)', len(l), len(self._doc.children))
-                print('delete_header_footer l[0]', l[0].content, l[0].y)
+                # print('len(l)', len(l), len(pages))
+                # print('delete_header_footer l[0]', l[0].content, l[0].y)
 
-        return
+        return pages, delete_footer_header_list
 
     def delete_bold_text_duplicate(self, lt_text_box_list):
         # 拿出所有LTChar
@@ -349,7 +425,8 @@ class PDFConvert:
         return lt_line_list
 
     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
-        list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list, from_pdf=True)
+        list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
+                                                                                    from_pdf=True, is_reverse=True)
         self._page.in_table_objs = filter_objs
 
         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
@@ -492,7 +569,7 @@ class PDFConvert:
         log("page_no: " + str(page_no) + ' is_b_table_flag ' + str(is_b_table_flag))
         return is_b_table_flag
 
-    def convert_page(self, page, page_no):
+    def convert_page(self, page, page_no, skip_image=0):
         layout = self.get_layout(page, page_no)
         if self._doc.error_code is not None:
             return
@@ -517,6 +594,18 @@ class PDFConvert:
                             continue
                         lt_image_list.append(y)
 
+        # 判断纯文本
+        if len(lt_image_list) == 0 and len(lt_text_list) == 0:
+            self.only_text_list[page_no] = 0
+        elif len(lt_image_list) == 0:
+            self.only_text_list[page_no] = 1
+        else:
+            self.only_text_list[page_no] = 0
+
+        # 跳过图片
+        if skip_image:
+            lt_image_list = []
+
         # 判断读出来的是乱码,但有图片直接识别
         all_text = ''.join([x.get_text() for x in lt_text_list])
         all_text = re.sub('[\s\d]', '', all_text)
@@ -543,6 +632,7 @@ class PDFConvert:
             else:
                 _image = _Image(page_image[1], page_image[0])
                 _image.is_from_pdf = True
+                _image.is_reverse = False
                 self._page.add_child(_image)
 
         # 正常读取该页对象
@@ -564,6 +654,7 @@ class PDFConvert:
                         else:
                             _image = _Image(page_image[1], page_image[0])
                             _image.is_from_pdf = True
+                            _image.is_reverse = False
                             self._page.add_child(_image)
                             image_md5 = get_md5_from_bytes(page_image[1])
                             self.md5_image_obj_list.append([image_md5, _image])
@@ -608,6 +699,7 @@ class PDFConvert:
                 else:
                     _image = _Image(page_image[1], page_image[0])
                     _image.is_from_pdf = True
+                    _image.is_reverse = True
                     _image.b_table_from_text = True
                     _image.b_table_text_obj_list = lt_text_list
                     _image.b_table_layout_size = (layout.width, layout.height)

+ 3 - 1
format_convert/convert_tree.py

@@ -84,6 +84,8 @@ class _Image:
     def __init__(self, content, path, bbox=(0, 0, 0, 0)):
         self.content = content
         self.path = path
+        # 是否反向排序
+        self.is_reverse = False
         # 来源
         self.is_from_pdf = False
         self.is_from_docx = False
@@ -140,7 +142,7 @@ class _Image:
         image_np = cv2.imread(self.path)
         obj_list = image_process(image_np, self.path, self.is_from_pdf, self.is_from_docx,
                                  self.b_table_from_text, self.b_table_text_obj_list,
-                                 self.b_table_layout_size)
+                                 self.b_table_layout_size, self.is_reverse)
         if judge_error_code(obj_list):
             self.error_code = obj_list
             return

+ 15 - 4
format_convert/utils.py

@@ -339,12 +339,13 @@ def slash_replace(_str, reverse=False):
 
 class LineTable:
     def recognize_table(self, list_textbox, list_line, sourceP_LB=True,
-                        splited=False, from_pdf=False, show=0):
+                        splited=False, from_pdf=False, is_reverse=False, show=0):
         self.list_line = list_line
         self.list_crosspoints = self.recognize_crosspoints(list_line)
         self.from_pdf = from_pdf
         self.splited = splited
         self.connect_bbox_list = []
+        self.is_reverse = is_reverse
         self.show = show
 
         if self.show:
@@ -1044,8 +1045,6 @@ class LineTable:
 
             for _tmp in extend_line:
                 _line.insert(_tmp["index"], _tmp["cell"])
-        # 排序
-        _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
 
     def feedText2table(self, _table, list_textbox, in_objs, sourceP_LB):
 
@@ -1257,6 +1256,12 @@ class LineTable:
         if _table is None:
             return
 
+        # pdf纯文本上下颠倒,pdf图片不颠倒
+        if self.is_reverse:
+            _table.sort(key=lambda x: (-x[0].get('bbox')[1], -x[0].get('bbox')[3]))
+        else:
+            _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
+
         self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
 
         # print("table===========================>")
@@ -1273,7 +1278,6 @@ class LineTable:
         #     print("\n")
         # print("------------")
 
-        _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
         self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
 
         if self.show:
@@ -2222,6 +2226,13 @@ def bbox_iou(bbox1, bbox2, contain=True):
     return iou
 
 
+def image_rotate(image_np, angle):
+    """Rotate an image array by ``angle`` and return the result as an array.
+
+    The array is converted to a PIL image, rotated with ``expand=1`` and
+    converted back with ``np.array``.
+    """
+    rotated_pil = Image.fromarray(image_np).rotate(angle, expand=1)
+    return np.array(rotated_pil)
+
+
 if __name__ == "__main__":
     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
     # print(slash_replace(strs))

+ 9 - 6
ocr/tools/infer/predict_rec.py

@@ -81,13 +81,15 @@ class TextRecognizer(object):
         h, w = img.shape[:2]
         imgC, imgH, imgW = self.rec_image_shape
         assert imgC == img.shape[2]
-
+        # print('max_wh_ratio', max_wh_ratio)
         if max_wh_ratio < 0.1:
-            if h > imgW:
-                resized_image = cv2.resize(img, (w, imgW))
-            else:
-                resized_image = img
+            # if h > imgW:
+            #     resized_image = cv2.resize(img, (w, imgW))
+            # else:
+            #     resized_image = img
 
+            # max_wh_ratio h是w的10倍,直接跳过
+            resized_w = None
         else:
             if self.character_type == "ch":
                 imgW = int((32 * max_wh_ratio))
@@ -109,7 +111,8 @@ class TextRecognizer(object):
         resized_image -= 0.5
         resized_image /= 0.5
         padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
-        padding_im[:, :, 0:resized_w] = resized_image
+        if resized_w is not None:
+            padding_im[:, :, 0:resized_w] = resized_image
         return padding_im
 
     def resize_norm_img_srn(self, img, image_shape):