Browse Source

1. 新增wps类型
2. 新增ofd类型
3. 新增两列无边框表格识别
4. 修复ocr爆显存
5. pdf处理速度优化
6. 特殊康熙字体处理
7. 新增监控平均处理时间

fangjiasheng 9 months ago
parent
commit
ab202ff1fc
58 changed files with 12210 additions and 640 deletions
  1. 2452 16
      botr/extract_table.py
  2. 5 0
      botr/utils.py
  3. 1 1
      config/interface_new.yml
  4. 287 118
      format_convert/convert.py
  5. 94 22
      format_convert/convert_doc.py
  6. 205 18
      format_convert/convert_docx.py
  7. 819 25
      format_convert/convert_image.py
  8. 26 9
      format_convert/convert_need_interface.py
  9. 75 0
      format_convert/convert_ofd.py
  10. 75 0
      format_convert/convert_ofd_test.py
  11. 352 51
      format_convert/convert_pdf.py
  12. 30 36
      format_convert/convert_test.py
  13. 91 4
      format_convert/convert_tree.py
  14. 61 0
      format_convert/convert_wps.py
  15. 6 0
      format_convert/easyofd/easyofd/__init__.py
  16. 474 0
      format_convert/easyofd/easyofd/chinese_characters.txt
  17. 23 0
      format_convert/easyofd/easyofd/draw/__init__.py
  18. 290 0
      format_convert/easyofd/easyofd/draw/draw_ofd.py
  19. 1178 0
      format_convert/easyofd/easyofd/draw/draw_pdf.py
  20. 113 0
      format_convert/easyofd/easyofd/draw/find_seal_img.py
  21. 216 0
      format_convert/easyofd/easyofd/draw/font_tools.py
  22. 666 0
      format_convert/easyofd/easyofd/draw/ofdtemplate.py
  23. 966 0
      format_convert/easyofd/easyofd/draw/pdf_parse.py
  24. BIN
      format_convert/easyofd/easyofd/draw/simsun.ttc
  25. 301 0
      format_convert/easyofd/easyofd/ofd.py
  26. 37 0
      format_convert/easyofd/easyofd/parser_ofd/__init__.py
  27. 145 0
      format_convert/easyofd/easyofd/parser_ofd/file_annotation_parser.py
  28. 7 0
      format_convert/easyofd/easyofd/parser_ofd/file_attachment_parser.py
  29. 140 0
      format_convert/easyofd/easyofd/parser_ofd/file_content_parser.py
  30. 7 0
      format_convert/easyofd/easyofd/parser_ofd/file_customtag_parser.py
  31. 104 0
      format_convert/easyofd/easyofd/parser_ofd/file_deal.py
  32. 99 0
      format_convert/easyofd/easyofd/parser_ofd/file_doc_parser.py
  33. 36 0
      format_convert/easyofd/easyofd/parser_ofd/file_docres_parser.py
  34. 41 0
      format_convert/easyofd/easyofd/parser_ofd/file_ofd_parser.py
  35. 58 0
      format_convert/easyofd/easyofd/parser_ofd/file_parser.py
  36. 63 0
      format_convert/easyofd/easyofd/parser_ofd/file_parser_base.py
  37. 52 0
      format_convert/easyofd/easyofd/parser_ofd/file_publicres_parser.py
  38. 63 0
      format_convert/easyofd/easyofd/parser_ofd/file_signature_parser.py
  39. 100 0
      format_convert/easyofd/easyofd/parser_ofd/find_seal_img.py
  40. 35 0
      format_convert/easyofd/easyofd/parser_ofd/img_deal.py
  41. 607 0
      format_convert/easyofd/easyofd/parser_ofd/ofd_parser.py
  42. 31 0
      format_convert/easyofd/easyofd/parser_ofd/parameter_parser.py
  43. 61 0
      format_convert/easyofd/easyofd/parser_ofd/path_parser.py
  44. 7 0
      format_convert/easyofd/easyofd/template_ofd/__init__.py
  45. 53 0
      format_convert/font_map/extend_to_normal_dict.txt
  46. 214 0
      format_convert/font_map/kangxi_to_normal
  47. 154 0
      format_convert/font_map/kangxi_to_normal_dict.txt
  48. 327 0
      format_convert/ofd/ofd_parser.py
  49. 320 12
      format_convert/utils.py
  50. 3 0
      monitor/watch_10_minutes_process.sh
  51. 8 26
      ocr/ocr_interface.py
  52. 6 2
      ocr/ppocr/data/__init__.py
  53. 39 0
      ocr/test_lock.py
  54. 36 52
      ocr/tools/infer/predict_det_pytorch.py
  55. 188 173
      ocr/tools/infer/predict_rec_pytorch.py
  56. 106 45
      ocr/tools/infer/predict_system.py
  57. 1 0
      start_and_stop/kill_convert.sh
  58. 256 30
      tika_/tika_interface.py

+ 2452 - 16
botr/extract_table.py

@@ -1,29 +1,37 @@
+import copy
+import math
+import os
 import re
 import re
 import time
 import time
 import traceback
 import traceback
+from glob import glob
+import numpy as np
 import cv2
 import cv2
+import wcwidth
 from pdfminer.layout import LTLine
 from pdfminer.layout import LTLine
 # from botr.nsp.predict import nsp_predict
 # from botr.nsp.predict import nsp_predict
+from sklearn.cluster import KMeans
+
 from botr.rules.get_table_by_rules import get_table_by_rule
 from botr.rules.get_table_by_rules import get_table_by_rule
 from botr.utils import line_iou, get_table_iou
 from botr.utils import line_iou, get_table_iou
 from format_convert.convert_need_interface import from_yolo_interface
 from format_convert.convert_need_interface import from_yolo_interface
-from format_convert.utils import log, np2bytes
+from format_convert.utils import log, np2bytes, text_bbox_to_lt, pil_resize, memory_decorator
 
 
 
 
 def b_table_process(list_line, list_text_boxes, list_cell, table_location):
 def b_table_process(list_line, list_text_boxes, list_cell, table_location):
     def merge_textbox(textbox_list, in_objs):
     def merge_textbox(textbox_list, in_objs):
         delete_obj = []
         delete_obj = []
         threshold = 5
         threshold = 5
-        textbox_list.sort(key=lambda x:x.bbox[0])
+        textbox_list.sort(key=lambda x: x.bbox[0])
         for k in range(len(textbox_list)):
         for k in range(len(textbox_list)):
             tb1 = textbox_list[k]
             tb1 = textbox_list[k]
             if tb1 not in in_objs and tb1 not in delete_obj:
             if tb1 not in in_objs and tb1 not in delete_obj:
-                for m in range(k+1, len(textbox_list)):
+                for m in range(k + 1, len(textbox_list)):
                     tb2 = textbox_list[m]
                     tb2 = textbox_list[m]
                     if tb2 in in_objs:
                     if tb2 in in_objs:
                         continue
                         continue
-                    if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
-                            and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
+                    if abs(tb1.bbox[1] - tb2.bbox[1]) <= threshold \
+                            and abs(tb1.bbox[3] - tb2.bbox[3]) <= threshold:
                         if tb1.bbox[0] <= tb2.bbox[0]:
                         if tb1.bbox[0] <= tb2.bbox[0]:
                             tb1.text = tb1.text + tb2.text
                             tb1.text = tb1.text + tb2.text
                         else:
                         else:
@@ -35,6 +43,7 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
             if _obj in textbox_list:
             if _obj in textbox_list:
                 textbox_list.remove(_obj)
                 textbox_list.remove(_obj)
         return textbox_list
         return textbox_list
+
     try:
     try:
         if list_line:
         if list_line:
             from format_convert.convert_tree import TableLine
             from format_convert.convert_tree import TableLine
@@ -55,7 +64,7 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
             current_y = area_list_text_boxes[0].bbox[1]
             current_y = area_list_text_boxes[0].bbox[1]
             current_y2 = area_list_text_boxes[0].bbox[3]
             current_y2 = area_list_text_boxes[0].bbox[3]
             # threshold = 2.
             # threshold = 2.
-            threshold = max(2., 1/3 * abs(current_y2 - current_y))
+            threshold = max(2., 1 / 3 * abs(current_y2 - current_y))
             for t_b in area_list_text_boxes:
             for t_b in area_list_text_boxes:
                 bbox = t_b.bbox
                 bbox = t_b.bbox
                 if current_y - threshold <= bbox[1] <= current_y + threshold:
                 if current_y - threshold <= bbox[1] <= current_y + threshold:
@@ -69,6 +78,11 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
             obj_in_table = []
             obj_in_table = []
             table_dict = {'bbox': table_location}
             table_dict = {'bbox': table_location}
             row_list = []
             row_list = []
+
+            # yolo检测出的表格,忽略两列的,因为已经补充了两列的新规则 250529
+            if list_cell and len(list_cell[0]) == 2:
+                return list_text_boxes, [], set()
+
             for row in list_cell:
             for row in list_cell:
                 col_list = []
                 col_list = []
                 for col in row:
                 for col in row:
@@ -112,17 +126,19 @@ def get_text_box_obj(_text_list, _bbox_list):
     return _text_box_list
     return _text_box_list
 
 
 
 
-def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
+def get_table(img, table_list, text_list, bbox_list, text_box_list, from_pdf=False, show=0):
     log('start')
     log('start')
     # 检测无边框表格
     # 检测无边框表格
     start_time_all = time.time()
     start_time_all = time.time()
     start_time = time.time()
     start_time = time.time()
     img_bytes = np2bytes(img)
     img_bytes = np2bytes(img)
     b_table_list = from_yolo_interface(img_bytes)
     b_table_list = from_yolo_interface(img_bytes)
-    log('yolo detect cost: ' + str(time.time()-start_time))
+    log('yolo detect cost: ' + str(time.time() - start_time))
     b_table_list = b_table_list[0]
     b_table_list = b_table_list[0]
     if not b_table_list:
     if not b_table_list:
         log('detect not b_table_list')
         log('detect not b_table_list')
+        if from_pdf:
+            save_b_table(img)
         return [], [], []
         return [], [], []
 
 
     # if show:
     # if show:
@@ -156,8 +172,9 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
         b_loc = [min_x, min_y, max_x, max_y, b_table[4]]
         b_loc = [min_x, min_y, max_x, max_y, b_table[4]]
         inter_flag = False
         inter_flag = False
         for table in table_list:
         for table in table_list:
-            loc = table.get('bbox')
-            rows = table.get('table')
+            # loc = table.get('bbox')
+            loc = table.bbox
+            # rows = table.get('table')
             iou = line_iou([[0, loc[1]], [0, loc[3]]], [[0, b_loc[1]], [0, b_loc[3]]], axis=1)
             iou = line_iou([[0, loc[1]], [0, loc[3]]], [[0, b_loc[1]], [0, b_loc[3]]], axis=1)
             if iou > 0.3:
             if iou > 0.3:
                 # if len(rows) <= 1:
                 # if len(rows) <= 1:
@@ -190,7 +207,7 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
             if b_loc1 in used_b_loc:
             if b_loc1 in used_b_loc:
                 continue
                 continue
             inter_flag = False
             inter_flag = False
-            for j in range(i+1, len(b_table_location_list)):
+            for j in range(i + 1, len(b_table_location_list)):
                 b_loc2 = b_table_location_list[j]
                 b_loc2 = b_table_location_list[j]
                 iou = line_iou([[0, b_loc1[1]], [0, b_loc1[3]]], [[0, b_loc2[1]], [0, b_loc2[3]]], axis=1)
                 iou = line_iou([[0, b_loc1[1]], [0, b_loc1[3]]], [[0, b_loc2[1]], [0, b_loc2[3]]], axis=1)
                 if show:
                 if show:
@@ -230,7 +247,8 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
 
 
         # 根据ocr bbox,规则生成表格线
         # 根据ocr bbox,规则生成表格线
         start_time = time.time()
         start_time = time.time()
-        line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list, b_loc, show=show)
+        line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list,
+                                                                                 b_loc, show=show)
         if not table_location:
         if not table_location:
             log('get_table_by_rule not table_location')
             log('get_table_by_rule not table_location')
             continue
             continue
@@ -240,14 +258,15 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
             area_bbox_list.append(eval(key))
             area_bbox_list.append(eval(key))
             area_text_list.append(bbox_text_dict.get(key))
             area_text_list.append(bbox_text_dict.get(key))
         b_text_box_list = get_text_box_obj(area_text_list, area_bbox_list)
         b_text_box_list = get_text_box_obj(area_text_list, area_bbox_list)
-        log('get_table_by_rule cost: ' + str(time.time()-start_time))
+        log('get_table_by_rule cost: ' + str(time.time() - start_time))
 
 
         # 根据表格线生成单元格
         # 根据表格线生成单元格
         start_time = time.time()
         start_time = time.time()
-        b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list, table_location)
+        b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list,
+                                                                           table_location)
         table_list += _table_list
         table_list += _table_list
         obj_in_table_list += _obj_in_table_list
         obj_in_table_list += _obj_in_table_list
-        log('b_table_process cost: ' + str(time.time()-start_time))
+        log('b_table_process cost: ' + str(time.time() - start_time))
 
 
         # if not table_list:
         # if not table_list:
         #     log('table_process not table_list')
         #     log('table_process not table_list')
@@ -317,4 +336,2421 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
         # _table_list[0]['table'] = new_table
         # _table_list[0]['table'] = new_table
 
 
     log('get_table finish ' + str(time.time() - start_time_all))
     log('get_table finish ' + str(time.time() - start_time_all))
-    return text_box_list, table_list, obj_in_table_list
+    return text_box_list, table_list, obj_in_table_list
+
+
def save_b_table(image_np):
    """Persist a page image in which YOLO failed to detect a borderless table.

    Files are written as ``<index>-<md5>.png`` so successive runs continue the
    numbering; saving stops after ``max_index`` images.  Does nothing when the
    target directory is absent (e.g. on machines other than the data host).

    :param image_np: page image (numpy array accepted by ``cv2.imwrite``)
    """
    _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_not_detect'
    # _path = 'D:/Project/format_conversion_maxcompute/save_b_table_not_detect'
    max_index = 20000
    if not os.path.exists(_path):
        return

    # Recover the running index from existing names: splitting on / . \ -
    # makes the third-from-last piece the numeric index.  Skip any file that
    # does not follow the naming scheme instead of crashing the caller.
    index = 0
    file_list = glob(_path + '/*')
    if file_list:
        file_index_list = []
        for file_path in file_list:
            try:
                file_index_list.append(int(re.split('[/.\\\\-]', file_path)[-3]))
            except (ValueError, IndexError):
                continue
        if file_index_list:
            index = max(file_index_list) + 1
    if index > max_index:
        return

    # md5 of the document currently being converted (set per request)
    from format_convert import _global
    _md5 = _global.get("md5")

    _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
    cv2.imwrite(_image_path, image_np)
    log('save yolo not detect b_table image success!')
+
@memory_decorator
def get_b_table_by_blank_colon(lt_text_list, table_list, layout_bbox, image_np=None, show=0):
    """Detect borderless tables on a page from blank gaps and colons.

    Works on text boxes only: rows are formed from vertical blanks, candidate
    tables from consecutive multi-column rows, and cells are finally derived
    from colon-separated key/value texts.  When ``image_np`` is given the
    input comes from OCR and gets extra bbox cleanup; otherwise the input is
    assumed to come from pdfminer-style layout objects.

    :param lt_text_list: text-box objects with ``.bbox`` and ``.get_text()``
        -- presumably pdfminer ``LTTextLine``-like; confirm with callers
    :param table_list: already-detected bordered tables (objects with
        ``.bbox``); the page is partitioned around them
    :param layout_bbox: page bbox; only indexes 2 (width) and 3 (height) used
    :param image_np: page image for the OCR case, or None for the PDF case
    :param show: debug flag; opens OpenCV windows and prints internals
    :return: ``(result_table_list, not_b_table_list)`` where each result is
        ``[table_dict, table_bbox]`` and the second list holds regions
        confirmed as non-tables
    """
    start_time = time.time()

    # print('len(lt_text_list)', len(lt_text_list))
    # for lt_text in lt_text_list:
    #     print('lt_text', lt_text)

    # Early exit: a colon-style key/value table needs a minimum number of
    # texts containing a (full- or half-width) colon.
    colon_cnt = 0
    for lt_text in lt_text_list:
        if re.search('[::]', lt_text.get_text()):
            colon_cnt += 1
    if colon_cnt <= 6:
        log('pre judge colon_cnt <= 6')
        return [], []

    # Image (OCR) case: bail out when there are many boxes and most of them
    # are single characters -- such pages are noise for this detector.
    if image_np is not None and len(lt_text_list) >= 60:
        single_char_cnt = 0
        for lt_text in lt_text_list:
            if len(lt_text.get_text()) <= 1:
                single_char_cnt += 1
        # log('len(lt_text_list), single_char_cnt ' + str(len(lt_text_list)) + ' ' + str(single_char_cnt))
        if single_char_cnt > 50 or single_char_cnt > 1/3 * len(lt_text_list):
            return [], []

    # raise
    # Regions confirmed as NON-tables are also returned, so a later YOLO pass
    # does not mislabel them as tables and corrupt the output.
    not_b_table_list = []

    layout_h = int(layout_bbox[3])
    layout_w = int(layout_bbox[2])

    if show:
        print('layout_w, layout_h', layout_w, layout_h)
        show_image = np.full((layout_h, layout_w, 3), 255, dtype=np.uint8)

    if show and image_np is not None:
        image_np_show = copy.copy(image_np)
        for lt_text in lt_text_list:
            bbox = [int(x) for x in lt_text.bbox]
            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
        cv2.imshow('image origin', image_np_show)
        cv2.waitKey(0)

    # Preprocessing for the PDF case
    start_time1 = time.time()
    if image_np is None:
        # Split a single lt_text wherever runs of multiple inner spaces occur
        lt_text_list = split_lt_text_by_many_space(lt_text_list)

        if show:
            for lt_text in lt_text_list:
                bbox = [int(x) for x in lt_text.bbox]
                cv2.rectangle(show_image, bbox[:2], bbox[2:4], (0, 0, 255))
            cv2.imshow('pdf preprocess', show_image)
            cv2.waitKey(0)
        # log('get_b_table_by_blank_colon pdf preprocess cost: ' + str(time.time()-start_time1))

    # Preprocessing for the image (OCR) case
    start_time1 = time.time()
    if image_np is not None:
        # Drop empty boxes
        start_time2 = time.time()
        lt_text_list = delete_empty_bbox(lt_text_list)
        # print('delete_empty_bbox cost: ', time.time()-start_time2)

        # OCR boxes must be shrunk to hug the text so rows can later be
        # separated on blank gaps.
        start_time2 = time.time()
        new_bbox_list = shrink_bbox(image_np, [x.bbox for x in lt_text_list])
        # print('shrink_bbox cost: ', time.time()-start_time2)
        start_time2 = time.time()
        for i, lt_text in enumerate(lt_text_list):
            lt_text.bbox = new_bbox_list[i]
        # print('lt_text.bbox = new_bbox_list[i] cost: ', time.time()-start_time2)
        # log('get_b_table_by_blank_colon image preprocess1 cost: ' + str(time.time()-start_time1))

    # Average width of a single character (total text width / char count)
    start_time1 = time.time()
    all_char_cnt = 0
    all_text_width = 0
    for lt_text in lt_text_list:
        all_char_cnt += len(lt_text.get_text())
        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
    if all_char_cnt == 0:
        return [], not_b_table_list
    avg_char_width = all_text_width / all_char_cnt

    # Second preprocessing pass for the image case
    if image_np is not None:
        # OCR may split one table value on spaces; merge such fragments
        lt_text_list = merge_same_bbox(lt_text_list, avg_char_width)

        # Repair overlapping (crossing) bboxes
        lt_text_list = fix_cross_bbox(lt_text_list)
        # log('get_b_table_by_blank_colon image preprocess2 cost: ' + str(time.time()-start_time1))

    if show and image_np is not None:
        image_np_show = copy.copy(image_np)
        for lt_text in lt_text_list:
            bbox = [int(x) for x in lt_text.bbox]
            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
        cv2.imshow('image preprocess', image_np_show)
        cv2.waitKey(0)

    if show:
        for lt_text in lt_text_list:
            print('lt_text', lt_text)

    # Filter out boxes whose coordinates are negative or absurdly large
    temp_list = []
    for lt_text in lt_text_list:
        if min(lt_text.bbox) < 0 or max(lt_text.bbox) > 10000:
            continue
        temp_list.append(lt_text)
    lt_text_list = temp_list

    if show:
        for lt_text in lt_text_list:
            cv2.rectangle(show_image,
                          (int(lt_text.bbox[0]), int(lt_text.bbox[1])),
                          (int(lt_text.bbox[2]), int(lt_text.bbox[3])),
                          (0, 0, 255)
                          )
        for table in table_list:
            cv2.rectangle(show_image,
                          (int(table.bbox[0]), int(table.bbox[1])),
                          (int(table.bbox[2]), int(table.bbox[3])),
                          (0, 255, 0)
                          )

    # Recompute the average character width on the filtered list
    all_char_cnt = 0
    all_text_width = 0
    for lt_text in lt_text_list:
        all_char_cnt += len(lt_text.get_text())
        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
    if all_char_cnt == 0:
        return [], not_b_table_list
    avg_char_width = all_text_width / all_char_cnt
    if show:
        print('avg_char_width', avg_char_width)

    # NOTE(review): both branches use the same factor; kept as-is in case the
    # per-source factor is tuned later.
    if image_np is None:
        blank_width = 1 * avg_char_width
    else:
        blank_width = 1 * avg_char_width
    if show:
        print('blank_width', blank_width)

    # Split the page into horizontal areas around the bordered tables
    table_h_list = []
    area_h_list = []
    area_start_h = 0
    table_list.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3]))
    for table in table_list:
        table_h_list.append([table.bbox[1], table.bbox[3]])
        area_h_list.append([area_start_h, table.bbox[1]])
        area_start_h = table.bbox[3]
    area_h_list.append([area_start_h, layout_h])

    if show:
        for min_h, max_h in area_h_list:
            print('area_h_list', min_h, max_h)
            cv2.rectangle(show_image,
                          (0, int(min_h)),
                          (layout_w, int(max_h)),
                          (255, 0, 0)
                          )

    # Assign each text box to the area that fully contains it vertically
    lt_text_area_list = []
    for area_min_h, area_max_h in area_h_list:
        sub_area = []
        for lt_text in lt_text_list:
            if area_min_h <= lt_text.bbox[1] <= lt_text.bbox[3] <= area_max_h:
                sub_area.append(lt_text)
        lt_text_area_list.append(sub_area)
    if show:
        print('len(lt_text_area_list)', len(lt_text_area_list))

    # Detect borderless tables in each area independently
    result_table_list = []
    start_time1 = time.time()
    for sub_lt_text_list in lt_text_area_list:
        start_time2 = time.time()
        lt_text_row_list = get_text_row_by_blank(sub_lt_text_list, layout_h)
        # log('get_text_row_by_blank cost: ' + str(time.time()-start_time2))

        # Placeholder lt_texts created while building rows must also join
        # lt_text_list so later passes can see them.
        for row in lt_text_row_list:
            for lt_text in row:
                if lt_text not in lt_text_list:
                    lt_text_list.append(lt_text)

        if show:
            for row in lt_text_row_list:
                print('row', row)

        start_time2 = time.time()
        b_table_list1, b_table_bbox_list1 = get_b_table_by_lt_text_row(lt_text_row_list)
        # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2))

        # With the rough table areas fixed, re-split rows inside each area
        # for better precision.
        start_time2 = time.time()
        table_lt_text_row_list = []
        for bi, b_table in enumerate(b_table_list1):
            b_table_bbox = b_table_bbox_list1[bi]
            sub_lt_text_list = []
            for lt_text in lt_text_list:
                if b_table_bbox[1] <= lt_text.bbox[1] <= lt_text.bbox[3] <= b_table_bbox[3]:
                    sub_lt_text_list.append(lt_text)
            _lt_text_row_list, center_blank_row = get_text_row_by_center_blank(b_table, sub_lt_text_list, blank_width,
                                                                               layout_h)
            table_lt_text_row_list += _lt_text_row_list
        # log('get_text_row_by_center_blank cost: ' + str(time.time()-start_time2))

        start_time2 = time.time()
        b_table_list3, b_table_bbox_list3 = get_b_table_by_lt_text_row(table_lt_text_row_list)
        # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2))

        if show:
            for b_table in b_table_list3:
                print('b_table3', b_table)

        # Column check on the rough tables: boxes in different columns must
        # not cross; alignment is fine, and some blank gap is required.
        # Consecutive rows stay in the same table only when their blank gaps
        # overlap horizontally.
        start_time2 = time.time()
        b_table_list2 = []
        for b_table in b_table_list3:

            blank_row_list = get_blank_row(b_table, blank_width)
            if show:
                print('b_table get_blank_row b_table_list3', b_table)
                print('blank_row_list b_table_list3', blank_row_list)

            b_table2 = []
            for bi, lt_text_row1 in enumerate(b_table[:-1]):
                lt_text_row2 = b_table[bi + 1]
                # if row1_row2_has_same_col(lt_text_row1, lt_text_row2):
                if row1_row2_has_same_blank(blank_row_list[bi], blank_row_list[bi + 1]):
                    if lt_text_row1 not in b_table2:
                        b_table2.append(lt_text_row1)
                    if lt_text_row2 not in b_table2:
                        b_table2.append(lt_text_row2)
                else:
                    # print('not cross blank', blank_row_list[bi], blank_row_list[bi + 1])
                    if len(b_table2) >= 2:
                        b_table_list2.append(b_table2)
                    b_table2 = []
            if len(b_table2) >= 2:
                b_table_list2.append(b_table2)
        # log('get_blank_row cost: ' + str(time.time()-start_time2))

        if show:
            for b_table2 in b_table_list2:
                print('b_table2')
                for lt_text_row in b_table2:
                    print('b_table2 lt_text_row', lt_text_row)

        start_time2 = time.time()
        for bi, b_table2 in enumerate(b_table_list2):
            # Derive the actual table cells from the colons
            start_time3 = time.time()
            table2, center_blank_row, _not_b_table_bbox_list, table_bbox \
                = get_b_table_by_colon(b_table2, blank_width)
            log('get_b_table_by_colon cost: ' + str(time.time()-start_time3))
            not_b_table_list += [[[], x] for x in _not_b_table_bbox_list]

            if show and center_blank_row:
                print('show center_blank_row', center_blank_row)
                bx = int((center_blank_row[2] + center_blank_row[0]) / 2)
                by = int((center_blank_row[3] + center_blank_row[1]) / 2)
                br = int((center_blank_row[2] - center_blank_row[0]) / 2)
                if br <= 5:
                    br = 5
                print('bx, by, br', bx, by, br)
                cv2.circle(show_image, (bx, by), br, (0, 255, 0))

            if show:
                min_w, min_h, max_w, max_h = table_bbox
                cv2.rectangle(show_image,
                              (int(min_w), int(min_h)),
                              (int(max_w), int(max_h)),
                              (0, 255, 0)
                              )

            # Fix row-spanning in the final row
            # table2 = fix_final_row(table2)

            # Some single-column rows at the table end need supplementing
            table2 = add_last_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)

            table2 = add_first_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)

            # Convert the table to its dict representation
            table2 = table_list_to_dict(table2)

            # Standardize the table, e.g. strip placeholder cells
            table2 = standard_table(table2)

            if table2:
                result_table_list.append([table2, table_bbox])
        # log('colon, add, standard cost: ' + str(time.time()-start_time2))

    # log('get_b_table_by_blank_colon area get b_table cost: ' + str(time.time()-start_time1))

    if show:
        cv2.namedWindow("final result", cv2.WINDOW_NORMAL)
        cv2.resizeWindow("final result", 768, 1024)
        cv2.imshow('final result', show_image)
        cv2.waitKey(0)

    if show:
        for table in result_table_list:
            print('get_b_table_by_bbox table ', table)

        for not_table_bbox in not_b_table_list:
            print('not_table bbox ', not_table_bbox)

    # log('get_b_table_by_blank_colon cost: ' + str(time.time()-start_time))
    return result_table_list, not_b_table_list
+
+
def get_b_table_by_lt_text_row(lt_text_row_list, show=0):
    """Group consecutive multi-element rows into rough table candidates.

    A candidate table is a run of at least two consecutive rows that each
    contain at least two text boxes.  Also returns the bounding box of every
    candidate.

    :param lt_text_row_list: rows of text-box objects carrying ``.bbox``
    :param show: debug flag; print the candidates when truthy
    :return: ``(candidate_tables, bbox_list)`` with bboxes as [x1, y1, x2, y2]
    """
    candidate_tables = []
    run = []
    for row in lt_text_row_list:
        if len(row) >= 2:
            run.append(row)
            continue
        # A narrow row breaks the run; keep it only if long enough.
        if len(run) >= 2:
            candidate_tables.append(run)
        run = []
    if len(run) >= 2:
        candidate_tables.append(run)

    # Bounding box over every text box of every row of each candidate
    bbox_list = []
    for candidate in candidate_tables:
        boxes = [cell.bbox for row in candidate for cell in row]
        bbox_list.append([min(b[0] for b in boxes),
                          min(b[1] for b in boxes),
                          max(b[2] for b in boxes),
                          max(b[3] for b in boxes)])

    if show:
        for candidate in candidate_tables:
            print('b_table')
            for row in candidate:
                print('b_table lt_text_row', row)
    return candidate_tables, bbox_list
+
+
def row1_row2_has_same_col(row1, row2):
    """Return True when the boxes of two rows are column-compatible.

    Every pair (one box from each row) must either be horizontally separated
    by at least ``blank_len`` pixels, or one box must lie within the other's
    horizontal span (with ``threshold`` pixels of slack).  Any other overlap
    counts as a column crossing and fails the check.
    """
    threshold = 5
    blank_len = 2
    for box_a in row1:
        for box_b in row2:
            separated = (box_b.bbox[0] - box_a.bbox[2] >= blank_len
                         or box_a.bbox[0] - box_b.bbox[2] >= blank_len)
            a_holds_b = (box_a.bbox[0] - threshold <= box_b.bbox[0]
                         < box_b.bbox[2] <= box_a.bbox[2] + threshold)
            b_holds_a = (box_b.bbox[0] - threshold <= box_a.bbox[0]
                         < box_a.bbox[2] <= box_b.bbox[2] + threshold)
            if not (separated or a_holds_b or b_holds_a):
                # Partial overlap: the rows cannot share a column layout.
                return False
    return True
+
+
def get_blank_row(lt_text_row_list, blank_min_width, show=0):
    """Collect the horizontal blank gaps inside each text row.

    For every row, each ordered (left, right) pair of non-overlapping boxes
    yields a candidate blank span; only the narrowest blank per left box is
    kept.  A row contributes its blanks only if at least one of them is wider
    than ``blank_min_width``; otherwise an empty list is appended, keeping
    the result aligned index-for-index with ``lt_text_row_list``.

    :param lt_text_row_list: rows of text-box objects with ``.bbox`` and
        ``.get_text()`` -- presumably pdfminer-like; confirm with callers
    :param blank_min_width: minimum blank width for a row to qualify
    :param show: debug flag; print candidate pairs when truthy
    :return: list of blank spans per row, same length as the input
    """
    blank_row_list = []
    # blank_min_width = avg_char_width * 3
    for lt_text_row in lt_text_row_list:
        lt_text_row.sort(key=lambda x: x.bbox[0])
        blank_row = []
        if len(lt_text_row) < 2:
            # A single box has no inner gap
            blank_row_list.append([])
        else:
            # Build blanks from every ordered pair of boxes in the row
            for lt_text1 in lt_text_row:
                sub_row = []
                for lt_text2 in lt_text_row:
                    if lt_text1 == lt_text2:
                        continue
                    # Only left-to-right pairs
                    if lt_text1.bbox[2] > lt_text2.bbox[0]:
                        continue
                    line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
                    line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
                    # Skip horizontally overlapping boxes
                    if line_iou(line1, line2) > 0:
                        continue
                    # NOTE(review): the y-range mixes bbox[3]/bbox[1] of the
                    # two boxes -- looks intentional for row blanks; verify.
                    sub_row.append([min(lt_text1.bbox[2], lt_text2.bbox[0]),
                                    min(lt_text1.bbox[3], lt_text2.bbox[1]),
                                    max(lt_text1.bbox[2], lt_text2.bbox[0]),
                                    max(lt_text1.bbox[3], lt_text2.bbox[1]),
                                    ])
                    if show:
                        print('sub_row', lt_text1.get_text(), lt_text2.get_text(), sub_row[-1])

                # Keep only the narrowest blank for each left box
                if not sub_row:
                    continue
                sub_row.sort(key=lambda x: abs(x[0] - x[2]))
                if show:
                    print('sub_row[-1]', lt_text1.get_text(), sub_row[-1])

                blank_row.append(sub_row[0])

            # The row qualifies only if at least one blank reaches the
            # minimum width; otherwise record no blanks for this row.
            match_flag = 0
            for r in blank_row:
                if abs(r[2] - r[0]) >= blank_min_width:
                    match_flag = 1
                    break
            if match_flag:
                blank_row_list.append(blank_row)
            else:
                blank_row_list.append([])

    return blank_row_list
+
+
def row1_row2_has_same_blank(row1, row2):
    """Return True when any blank of ``row1`` overlaps any blank of ``row2``.

    Blanks are [x1, y1, x2, y2] spans; two blanks match when either endpoint
    of one falls within the horizontal range of the other.  Used to decide
    whether two consecutive rows share a column gap.
    """
    for blank_a in row1:
        for blank_b in row2:
            overlaps = (blank_a[0] <= blank_b[0] <= blank_a[2]
                        or blank_a[0] <= blank_b[2] <= blank_a[2]
                        or blank_b[0] <= blank_a[0] <= blank_b[2]
                        or blank_b[0] <= blank_a[2] <= blank_b[2])
            if overlaps:
                # One shared gap is enough
                return True
    return False
+
+
@memory_decorator
def get_b_table_by_colon(b_table, blank_width, show=0):
    """Parse a borderless 2-column "head: value" table from grouped text rows.

    :param b_table: rows of LTText-like boxes (have .bbox and .get_text())
    :param blank_width: minimum width of the blank strip separating the columns
    :param show: debug printing switch
    :return: tuple of
        - table rows as [head1, value1, head2, value2] ([] when rejected)
        - bbox of the center blank strip (None when rejected)
        - bboxes judged as definitely not tables (returned so a later YOLO
          table detector does not mistake them for tables and corrupt data)
        - bbox of the whole candidate region
    """
    table_bbox = get_table_bbox(b_table)
    not_table_bbox_list = []

    # Every row must resolve to 2..4 columns; boxes overlapping horizontally
    # (line IoU >= 0.5) count as the same column. Also count colon heads per row.
    row_cnt_list = []
    head_cnt_list = []
    for row in b_table:
        if not row:
            continue
        row.sort(key=lambda x: (x.bbox[0]))
        col_cnt = 1
        head_cnt = 0
        if re.search('[::]', row[0].get_text()):
            head_cnt += 1
        for ci, col in enumerate(row):
            if ci == 0:
                continue
            col1 = row[ci - 1]
            col2 = row[ci]
            line1 = [(col1.bbox[0], 0), (col1.bbox[2], 0)]
            line2 = [(col2.bbox[0], 0), (col2.bbox[2], 0)]
            if line_iou(line1, line2) >= 0.5:
                continue
            else:
                col_cnt += 1
                if re.search('[::]', col2.get_text()):
                    head_cnt += 1
        row_cnt_list.append(col_cnt in [2, 3, 4])
        head_cnt_list.append(head_cnt)

    if show:
        print('row_cnt_list', row_cnt_list)
        print('head_cnt_list', head_cnt_list)

    # Fix: guard against an empty (or all-empty-rows) b_table, which previously
    # made max(head_cnt_list) raise ValueError and row_cnt_list[-1] IndexError.
    if not head_cnt_list:
        return [], None, not_table_bbox_list, table_bbox

    # More than 2 colon heads in a single row: not a 2-column key-value table.
    if max(head_cnt_list) > 2:
        if show:
            for row in b_table:
                print('head_cnt_list row', row)
        return [], None, not_table_bbox_list, table_bbox

    # The last row (often a date line) may break the 2/3/4-column rule; drop it.
    if row_cnt_list[-1] is False:
        row_cnt_list = row_cnt_list[:-1]
        b_table = b_table[:-1]
        table_bbox = get_table_bbox(b_table)

    row_cnt_list = list(set(row_cnt_list))
    if not (len(row_cnt_list) == 1 and row_cnt_list[0] is True):
        return [], None, not_table_bbox_list, table_bbox

    # At least half the rows must contain a text with both a colon and Chinese.
    colon_cnt = 0
    for lt_text_row in b_table:
        for lt_text in lt_text_row:
            if re.search('[::]', lt_text.get_text()) and re.search('[\u4e00-\u9fff]', lt_text.get_text()):
                colon_cnt += 1
    if show:
        print('colon_cnt, len(table)', colon_cnt, len(b_table))
    if colon_cnt < len(b_table) / 2:
        return [], None, not_table_bbox_list, table_bbox

    blank_row_list = get_blank_row(b_table, blank_width)
    if show:
        print('b_table get_blank_row colon', b_table)
        print('blank_row_list colon', blank_row_list)

    # The widest blank strip shared across rows is the column separator.
    center_blank_row = choose_center_blank(blank_row_list, blank_width)
    if show:
        print('center_blank_row', center_blank_row)
    if not center_blank_row:
        return [], None, not_table_bbox_list, table_bbox

    # Split the region into two text columns along the center blank.
    col_list1, col_list2 = divide_2_col_by_center_blank(b_table, center_blank_row)
    # Both empty: usually a single column whose key and value sit far apart.
    # Record it as a non-table so later YOLO detection ignores it as well.
    if not col_list1 and not col_list2:
        not_table_bbox = get_table_bbox(b_table)
        not_table_bbox_list.append(not_table_bbox)
        return [], None, not_table_bbox_list, table_bbox

    # Within each column, split cells into [head, value] pairs around the colon.
    col_key_value_list1 = set_head_value_in_col(col_list1, col_list2)
    col_key_value_list2 = set_head_value_in_col(col_list2, col_list1)

    # Zip the two columns' pairs into 4-column rows, padding the shorter column.
    b_table_row_list = []
    for i in range(max(len(col_key_value_list1), len(col_key_value_list2))):
        col1 = col_key_value_list1[i] if i < len(col_key_value_list1) else ["", ""]
        col2 = col_key_value_list2[i] if i < len(col_key_value_list2) else ["", ""]
        b_table_row_list.append(col1[:2] + col2[:2])

    # Fix rows where head and value were stacked vertically in the same column.
    b_table_row_list = fix_head_value_match(b_table_row_list)

    if show:
        print('b_table_row_list', b_table_row_list)
    return b_table_row_list, center_blank_row, not_table_bbox_list, table_bbox
+
+
@memory_decorator
def get_text_row_by_blank(lt_text_list, layout_h, show=0):
    """Group text boxes into rows using the blank space above/below each box."""
    if show:
        for item in lt_text_list:
            print('lt_text_111', item)

    # Compute the nearest blank gap above and below every box, then cluster
    # boxes whose gaps contain each other into the same row.
    blanks = get_up_down_blank(lt_text_list)
    rows = get_contain_blank_row(blanks, layout_h)

    if show:
        for row in rows:
            print('lt_text_row', row)
    return rows
+
+
def get_text_row_by_center_blank(b_table, lt_text_list, blank_width, layout_h, show=0):
    """Group text boxes into rows, restricting neighbor search to each side of
    the table's shared center blank strip."""
    # Per-row blank spans of the candidate table.
    blank_row_list = get_blank_row(b_table, blank_width)
    if show:
        print('b_table get_blank_row center_blank', b_table)
        print('blank_row_list center_blank', blank_row_list)

    # The blank strip shared by the rows; without one there is nothing to split on.
    center_blank_row = choose_center_blank(blank_row_list, blank_width)
    if show:
        print('center_blank_row center', center_blank_row)
    if not center_blank_row:
        return [], []

    # Boxes are only compared with boxes on the same side of this x.
    center_x = (center_blank_row[2] + center_blank_row[0]) / 2
    blanks = get_up_down_blank(lt_text_list, center_x=center_x)
    rows = get_contain_blank_row(blanks, layout_h)

    if show:
        for row in rows:
            print('lt_text_row center', row)
    return rows, center_blank_row
+
+
def table_list_to_dict(table):
    """Wrap every cell text into a dict with default row/column spans of 1."""
    return [
        [{'rowspan': 1, 'columnspan': 1, 'text': cell} for cell in row]
        for row in table
    ]
+
+
@memory_decorator
def get_up_down_blank(lt_text_list, center_x=None, show=0):
    """For every text box, find the nearest blank gap above and below it.

    :param lt_text_list: LTText-like boxes (have .bbox and .get_text())
    :param center_x: optional x of the column separator; when given, only boxes
        on the same side of it count as vertical neighbors (horizontal overlap
        is then not required)
    :param show: debug printing switch
    :return: list of [lt_text, up_blank, down_blank], blanks as [y0, y1] spans
    """
    # Sort top-to-bottom (y ascending), then left-to-right; the nearest-gap
    # picks at the bottom of the loop rely on this ordering.
    lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
    lt_text_blank_list = []
    for i in range(len(lt_text_list)):
        lt_text1 = lt_text_list[i]
        line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
        if center_x is not None:
            # 0 = box center left of the separator, 1 = right of it.
            left_or_right1 = 0 if (lt_text1.bbox[0] + lt_text1.bbox[2]) / 2 <= center_x else 1

        up_blank_list = []
        down_blank_list = []
        for j in range(len(lt_text_list)):
            lt_text2 = lt_text_list[j]
            if lt_text1 == lt_text2:
                continue

            # No center-column split: a neighbor must overlap horizontally (IoU > 0).
            if center_x is None:
                line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
                iou = line_iou(line1, line2)
                if lt_text2.bbox[1] > lt_text1.bbox[3] and iou > 0:
                    down_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
                if lt_text2.bbox[3] < lt_text1.bbox[1] and iou > 0:
                    up_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
                # if lt_text1.bbox[1] > lt_text2.bbox[3] and iou > 0:
                #     down_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
                # if lt_text1.bbox[3] < lt_text2.bbox[1] and iou > 0:
                #     up_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
            # With a center-column split: a neighbor must be on the same side.
            else:
                left_or_right2 = 0 if (lt_text2.bbox[0] + lt_text2.bbox[2]) / 2 <= center_x else 1
                if lt_text2.bbox[1] > lt_text1.bbox[3] and left_or_right1 == left_or_right2:
                    down_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
                if lt_text2.bbox[3] < lt_text1.bbox[1] and left_or_right1 == left_or_right2:
                    up_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
                # if lt_text1.bbox[1] > lt_text2.bbox[3] and left_or_right1 == left_or_right2:
                #     down_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
                # if lt_text1.bbox[3] < lt_text2.bbox[1] and left_or_right1 == left_or_right2:
                #     up_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])

        # No neighbor found: fall back to a blank of the text's own height.
        text_h = abs(lt_text1.bbox[3] - lt_text1.bbox[1])
        if not up_blank_list:
            up_blank_list.append([max(0, lt_text1.bbox[1] - text_h), lt_text1.bbox[1]])
        if not down_blank_list:
            down_blank_list.append([lt_text1.bbox[3], lt_text1.bbox[3] + text_h])

        # With the y-ascending sort, the first entry below is the nearest gap
        # under the box and the last entry above the nearest gap over it.
        # NOTE(review): the sort key is the TOP edge (bbox[1]); for boxes with
        # very different heights the "nearest above" pick may be off — confirm.
        down_blank = down_blank_list[0]
        up_blank = up_blank_list[-1]

        if show:
            print('lt_text1.get_text()', lt_text1.get_text(), lt_text1.bbox)
            if center_x is not None:
                print('center_x', center_x)
            print('up_blank', up_blank)
            print('down_blank', down_blank)

        lt_text_blank_list.append([lt_text1, up_blank, down_blank])
    return lt_text_blank_list
+
+
@memory_decorator
def filter_large_blank_row(lt_text_blank_list, layout_h, show=0):
    """Put text boxes surrounded by an oversized vertical blank on their own row.

    :param lt_text_blank_list: entries of [lt_text, up_blank, down_blank]
    :param layout_h: layout height; a blank taller than layout_h / 6 is "large"
    :param show: debug printing switch
    :return: (rows already decided as single-text rows, those single texts)
    """
    lt_text_row_list = []
    single_lt_text_list = []
    # Isolation threshold: one sixth of the layout height.
    max_blank_h = layout_h / 6
    index = 0
    threshold = 20
    lt_text_blank_list.sort(key=lambda x: (x[0].bbox[1], x[0].bbox[0]))
    for lt_text1, up_blank1, down_blank1 in lt_text_blank_list:
        row = []
        # A blank taller than max_blank_h makes the text a row of its own.
        match_flag = 0
        # Boxes near the bottom (last 4 after the sort): judge by the gap above.
        # NOTE(review): for short lists these index ranges overlap; the first
        # matching branch wins — confirm that is intended.
        if index >= len(lt_text_blank_list) - 4 \
                and abs(up_blank1[0] - up_blank1[1]) >= max_blank_h:
            if show:
                print('match single lt_text 1')
            match_flag = 1
        # Boxes near the top (first 3): judge by the gap below.
        elif index <= 2 \
                and abs(down_blank1[0] - down_blank1[1]) >= max_blank_h:
            if show:
                print('match single lt_text 2')
            match_flag = 1
        # Boxes in the middle: judge by the span from the top of the upper gap
        # down to the bottom of the lower gap.
        elif 2 <= index <= len(lt_text_blank_list) - 4 \
                and abs(up_blank1[0] - down_blank1[1]) >= max_blank_h:
            # Only isolate when no other box sits on (roughly) the same line.
            has_same_row_flag = 0
            for lt_text2, _, _ in lt_text_blank_list:
                if lt_text1 == lt_text2:
                    continue
                if lt_text1.bbox[1] - threshold <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= lt_text1.bbox[3] + threshold:
                    has_same_row_flag = 1
                    break
            if has_same_row_flag:
                match_flag = 0
            else:
                match_flag = 1
            if show:
                print('match single lt_text 3')

        if match_flag:
            row.append(lt_text1)
            lt_text_row_list.append(row)
            single_lt_text_list.append(lt_text1)
        index += 1

    if show:
        print('single_lt_text_list', single_lt_text_list)
    return lt_text_row_list, single_lt_text_list
+
+
@memory_decorator
def get_contain_blank_row(lt_text_blank_list, layout_h, show=0):
    """Cluster text boxes into rows by matching the blank gaps around them.

    :param lt_text_blank_list: entries of [lt_text, up_blank, down_blank]
        as produced by get_up_down_blank (blanks are [y0, y1] spans)
    :param layout_h: layout height, used to isolate texts with huge blanks
    :param show: debug printing switch
    :return: list of rows (each a list of lt_text), sorted top-to-bottom
    """
    from format_convert.convert_tree import TextBox
    # Boxes with oversized blanks are forced onto rows of their own first.
    lt_text_row_list, single_lt_text_list = filter_large_blank_row(lt_text_blank_list, layout_h)
    single_lt_text_list = set(single_lt_text_list)

    # Boxes whose blanks contain each other belong to the same row.
    time1 = time.time()
    threshold = 5
    used_lt_text_list = set([])
    another_used_lt_text_list = set([])
    for i1 in range(len(lt_text_blank_list)):
        time2 = time.time()
        lt_text1, up_blank1, down_blank1 = lt_text_blank_list[i1]
        row = []
        if lt_text1 in single_lt_text_list:
            continue
        for i2 in range(len(lt_text_blank_list)):
            lt_text2, up_blank2, down_blank2 = lt_text_blank_list[i2]
            if lt_text1 == lt_text2:
                continue
            if lt_text2 in another_used_lt_text_list:
                continue
            # An already-placed box can only re-match boxes above it.
            if lt_text2 in used_lt_text_list and lt_text1.bbox[1] >= lt_text2.bbox[3]:
                continue
            if lt_text2 in single_lt_text_list:
                continue

            # Same row when this box's up-blank contains the other's up-blank,
            # or its down-blank contains the other's down-blank (with slack).
            if (up_blank1[0] - threshold <= up_blank2[0] <= up_blank2[1] <= up_blank1[1] + threshold) \
                    or (down_blank1[0] - threshold <= down_blank2[0] <= down_blank2[1] <= down_blank1[1] + threshold):
                    # or (up_blank2[0] - threshold <= up_blank1[0] <= up_blank1[1] <= up_blank2[1] + threshold) \
                    # or (down_blank2[0] - threshold <= down_blank1[0] <= down_blank1[1] <= down_blank2[1] + threshold):
                if lt_text2 not in row:
                    row.append(lt_text2)
                    used_lt_text_list.add(lt_text2)

            # (disabled) also match when the up/down blanks contain the other text itself
            # if up_blank1[0] <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= down_blank1[1]:
            #     if lt_text2 not in row:
            #         row.append(lt_text2)
            #         used_lt_text_list.append(lt_text2)



        if lt_text1 not in row:
            row.append(lt_text1)

        if show:
            print('get_contain_blank_row loop2 cost:', time.time()-time2)

        # Three or more colon cells in one row means an independent line was
        # merged in by mistake; split it back out.
        time2 = time.time()
        colon_cnt = 0
        colon_lt_text = []
        for lt in row:
            if re.search('[::]', lt.get_text()):
                colon_cnt += 1
                colon_lt_text.append(lt)
        if colon_cnt >= 3:
            if show:
                print('colon_cnt >= 3 row', row)

            another_lt_text_list = find_outline_lt_text(row)

            # # Put the lt_text with the largest y on its own row
            # colon_lt_text.sort(key=lambda x: x.bbox[1])
            # # All but the first two each get their own row
            # another_lt_text_list = colon_lt_text[2:]
            for lt_text in another_lt_text_list:
                if lt_text in row:
                    row.remove(lt_text)
                if lt_text in colon_lt_text:
                    colon_lt_text.remove(lt_text)

            if show:
                print('another_lt_text_list', another_lt_text_list)
                print('colon_lt_text', colon_lt_text)

            if not colon_lt_text:
                continue

            # Pair each split-out text with a placeholder head ("@@:") whose
            # bbox is aligned under the nearer of the row's leftmost/rightmost
            # colon texts, so the new row keeps a head slot.
            colon_lt_text.sort(key=lambda x: x.bbox[0])
            lt_text_row_list.append(row)
            for another_lt_text in another_lt_text_list:
                if abs(another_lt_text.bbox[0] - colon_lt_text[0].bbox[0]) > abs(
                        another_lt_text.bbox[0] - colon_lt_text[-1].bbox[0]):
                    new_bbox = [colon_lt_text[0].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[0].bbox[2], another_lt_text.bbox[3]]
                    another_row = [TextBox(text="@@:", bbox=new_bbox), another_lt_text]
                else:
                    new_bbox = [colon_lt_text[-1].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[-1].bbox[2], another_lt_text.bbox[3]]
                    # Add a placeholder column.
                    another_row = [another_lt_text, TextBox(text="@@:", bbox=new_bbox)]
                if show:
                    print('another_row', another_row)
                for lt_text3 in another_row:
                    another_used_lt_text_list.add(lt_text3)
                lt_text_row_list.append(another_row)
        else:
            lt_text_row_list.append(row)

        if show:
            print('get_contain_blank_row judge colon cost:', time.time()-time2)

    if show:
        print('get_contain_blank_row double loop cost: ', time.time()-time1)

    # Deduplicate: merge rows that share members.
    lt_text_row_list.sort(key=lambda x: len(x), reverse=True)
    if show:
        for lt_text_row in lt_text_row_list:
            print('before dedup lt_text_row', lt_text_row)

    lt_text_row_list = merge_intersecting_lists(lt_text_row_list)

    if show:
        for lt_text_row in lt_text_row_list:
            print('after dedup lt_text_row', lt_text_row)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])

    # Drop rows whose concatenated text is pure whitespace.
    temp_list = []
    for lt_text_row in lt_text_row_list:
        row_text = ""
        for lt_text in lt_text_row:
            row_text += lt_text.get_text()
        if re.sub('\s+', '', row_text) == "":
            continue
        temp_list.append(lt_text_row)
    lt_text_row_list = temp_list
    return lt_text_row_list
+
+
def choose_center_blank(blank_row_list, blank_width, show=0):
    """Pick the vertical blank strip shared by the rows (the column divider).

    :param blank_row_list: per-row lists of blank bboxes [x0, y0, x1, y1]
    :param blank_width: minimum width the widest blank must exceed to qualify
    :param show: debug printing switch
    :return: bbox of the shared center blank, or [] when none qualifies
    """
    if not blank_row_list:
        return []

    # Flatten all per-row blanks.
    blank_list = [y for x in blank_row_list for y in x]
    if not blank_list:
        return []

    # The overall widest blank is the candidate column separator.
    blank_list.sort(key=lambda x: abs(x[0] - x[2]), reverse=True)
    max_blank = blank_list[0]
    if show:
        print('max_blank', max_blank)
    if abs(max_blank[0] - max_blank[2]) <= blank_width:
        return []

    # Invariant across the loop below: only depends on the candidate.
    line1 = ([max_blank[0], 0], [max_blank[2], 0])

    # Collect, from each row, its widest blank — kept only when it overlaps
    # the candidate enough (line IoU >= 0.5).
    max_col = []
    for blank_row in blank_row_list:
        if not blank_row:
            continue

        # Widest blank of this row.
        max_blank_bbox = blank_row[0]
        for blank_bbox in blank_row[1:]:
            if abs(blank_bbox[0] - blank_bbox[2]) > abs(max_blank_bbox[0] - max_blank_bbox[2]):
                max_blank_bbox = blank_bbox

        if show:
            print('max_blank_bbox, blank_row', max_blank_bbox, blank_row)

        line2 = ([max_blank_bbox[0], 0], [max_blank_bbox[2], 0])
        iou = line_iou(line1, line2)
        if iou >= 0.5:
            max_col.append(max_blank_bbox)
    if show:
        print('max_col', max_col)
    if not max_col:
        return []

    # The shared center strip is the intersection of all matching blanks.
    center_blank_row = get_inter_part(max_col)
    return center_blank_row
+
+
def set_head_value_in_col(col_list1, col_list2, show=0):
    """Split each cell of a column into a [head, value] pair around the colon.

    A cell without a colon is either a dangling head (when the same text also
    appears in the sibling column) that is prepended to the next head, or a
    continuation that is glued onto the previous value.
    """
    pairs = []
    pending_head = ""
    for cell in col_list1:
        match = re.search('[::]+', cell)
        # Cell with a colon: text up to the colon is the head, rest the value.
        if match:
            head = cell[:match.end()]
            if pending_head:
                head = pending_head + head
                pending_head = ""
            pairs.append([head, cell[match.end():]])
        # No colon but the same text appears in the sibling column: it is a
        # head that spans rows — remember it for the next colon cell.
        elif cell in col_list2:
            if show:
                print('col1 in col_list2')
            # A previously pending colon-less head becomes its own value row.
            if pending_head:
                pairs.append(["", pending_head])
            pending_head = cell
        # Otherwise it continues the previous value (only when that value
        # itself still contains a colon), or starts a head-less row.
        else:
            if pairs and re.search('[::]', pairs[-1][1]):
                pairs[-1][1] += cell
            else:
                pairs.append(["", cell])

    # A head still pending at the end of the column becomes a value-only row.
    if pending_head:
        pairs.append(["", pending_head])

    if show:
        print('col_key_value_list', pairs)

    return pairs
+
+
def divide_2_col_by_center_blank(b_table, center_blank_row, show=0):
    """Split every row of *b_table* into a left and a right text column.

    A text box belongs to the left column when its horizontal center lies on or
    left of the center of *center_blank_row* (the shared vertical blank strip),
    otherwise to the right. Each side is concatenated in reading order.

    :param b_table: rows of LTText-like boxes (have .bbox and .get_text())
    :param center_blank_row: bbox [x0, y0, x1, y1] of the center blank strip
    :param show: debug printing switch
    :return: (col_list1, col_list2) — left/right text per row; both are
        emptied when either side lacks colons in a third of its cells, which
        marks the region as not being a 2-column key-value table.
    """
    # Improvements over previous revision: removed the unused col_box_dict
    # local and dead commented-out code; hoisted the loop-invariant center x.
    col_list1 = []
    col_list2 = []
    center_x = abs(center_blank_row[0] + center_blank_row[2]) / 2
    for lt_text_row in b_table:
        lt_text_row.sort(key=lambda x: x.bbox[0])

        left_col = []
        right_col = []
        for lt_text in lt_text_row:
            if (lt_text.bbox[2] + lt_text.bbox[0]) / 2 <= center_x:
                left_col.append(lt_text)
            else:
                right_col.append(lt_text)

        # Concatenate each side in reading order.
        left_col = sort_by_read_order(left_col)
        left_text = ''.join(x.get_text() for x in left_col)
        right_col = sort_by_read_order(right_col)
        right_text = ''.join(x.get_text() for x in right_col)

        col_list1.append(left_text.strip())
        col_list2.append(right_text.strip())

    if show:
        print('col_list1', col_list1)
        print('col_list2', col_list2)

    # Both columns must contain colons in at least a third of their cells;
    # otherwise this is not a 2-column key-value table.
    colon_cnt1 = sum(1 for col in col_list1 if re.search('[::]', col))
    colon_cnt2 = sum(1 for col in col_list2 if re.search('[::]', col))

    if colon_cnt1 < len(col_list1) / 3 or colon_cnt2 < len(col_list2) / 3:
        col_list1 = []
        col_list2 = []
        if show:
            print('col_list1 colon_cnt1 less', colon_cnt1)
            print('col_list2 colon_cnt2 less', colon_cnt2)

    return col_list1, col_list2
+
+
def delete_blank_col(b_table_row_list):
    """Drop every column whose cells are all empty strings."""
    # Gather the cell values present at each column index (rows may be ragged).
    col_values = {}
    for row in b_table_row_list:
        for idx, cell in enumerate(row):
            col_values.setdefault(idx, []).append(cell)

    # A column is blank when its only distinct value is ''.
    blank_cols = {idx for idx, cells in col_values.items() if set(cells) == {''}}

    return [
        [cell for idx, cell in enumerate(row) if idx not in blank_cols]
        for row in b_table_row_list
    ]
+
+
def fix_head_value_match(b_table, show=0):
    """Merge vertically split head/value rows of a 4-column table.

    A row like ['h1:', '', 'h2:', ''] followed by rows like ['', 'v1', '', 'v2']
    means heads and values were laid out on separate lines; the values are
    concatenated back into the head row and the value rows are removed.

    Fix over previous revision: a new head row appearing directly after a
    head/value group used to be silently dropped (the state was only reset to
    None); it now starts a new group.

    :param b_table: rows of 4 strings ('@@:' is a placeholder cell)
    :param show: debug printing switch
    :return: the repaired table (head rows are mutated in place)
    """
    if not b_table:
        return b_table
    if len(b_table[0]) != 4:
        return b_table

    def _is_head_row(row):
        # Heads (with colons) in columns 0/2, columns 1/3 still empty.
        return (row[1] in ["", '@@:'] and row[3] in ["", '@@:']
                and re.search("[::]", row[0]) and re.search("[::]", row[2]))

    def _is_value_row(row):
        # Values in columns 1/3 only.
        return (row[0] in ["", '@@:'] and row[2] in ["", '@@:']
                and row[1] not in ["", '@@:'] and row[3] not in ["", '@@:'])

    maybe_head_index = None
    match_head_value_dict = {}
    for row_i, row in enumerate(b_table):
        if maybe_head_index is None:
            if _is_head_row(row):
                maybe_head_index = row_i
        else:
            if _is_value_row(row):
                match_head_value_dict.setdefault(maybe_head_index, []).append(row_i)
            elif _is_head_row(row):
                # Fix: start a new head/value group instead of dropping the row.
                maybe_head_index = row_i
            else:
                maybe_head_index = None

    if show:
        print('match_head_value_dict', match_head_value_dict)

    add_row_dict = {}
    delete_head_index_list = []
    delete_value_index_list = []
    for row_index, value_index_list in match_head_value_dict.items():
        head_row = b_table[row_index]
        delete_head_index_list.append(row_index)
        left_value_text = ""
        right_value_text = ""
        for value_index in value_index_list:
            value_row = b_table[value_index]
            delete_value_index_list.append(value_index)
            left_value_text += ''.join(value_row[:2])
            right_value_text += ''.join(value_row[2:])
        head_row[1] = left_value_text
        head_row[3] = right_value_text
        add_row_dict[row_index] = head_row

    # Rebuild: head rows replaced by the merged rows, value rows dropped.
    temp_list = []
    for row_i, row in enumerate(b_table):
        if row_i in delete_head_index_list:
            temp_list.append(add_row_dict.get(row_i))
            continue
        if row_i in delete_value_index_list:
            continue
        temp_list.append(row)
    return temp_list
+
+
def add_last_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                  table_lt_text_row_list, show=0):
    """Absorb text rows sitting just BELOW a borderless 4-column table.

    Rows found within one (growing) average row gap below table_bbox[3] are
    parsed into (head, value) pairs, appended to b_table, and table_bbox[3]
    is extended down to the last row whose head contained a colon.
    Counterpart of add_first_rows.

    :param b_table: table rows, each [head, value, head, value].
    :param table_bbox: [x1, y1, x2, y2] of the table; item 3 may be mutated.
    :param center_blank_bbox: bbox of the blank band between the two
        head/value column pairs.
    :param lt_text_row_list: candidate text rows outside the table (lists
        of objects with .bbox and .get_text()).
    :param table_lt_text_row_list: text rows already inside the table, used
        to estimate the average inter-row gap.
    :return: b_table with the matched rows appended.
    """
    if not b_table:
        return b_table
    if len(b_table[0]) not in [4]:
        return b_table

    # Estimate the average vertical gap between consecutive text rows that
    # are already inside the table.
    blank_h_list = []
    max_h_list = []
    for lt_text_row in table_lt_text_row_list:
        if not lt_text_row:
            continue
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        max_h_list.append(max_h)
    max_h_list.sort(key=lambda x: x)
    for i in range(1, len(max_h_list)):
        blank_h_list.append(max_h_list[i] - max_h_list[i - 1])
    mean_blank_h = np.mean(blank_h_list)
    if show:
        print('add_last_rows blank_width_list', blank_h_list)
        print('add_last_rows mean_blank_h', mean_blank_h)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
    match_row_list = []
    threshold = 5
    add_blank_h = mean_blank_h + threshold
    for li, lt_text_row in enumerate(lt_text_row_list):
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('max_h > table_bbox[3]', lt_text_row, max_h, table_bbox[3])
        # The row's bottom must lie between the table's y2 and y2 plus one
        # (growing) average row gap.
        if table_bbox[3] < max_h < table_bbox[3] + add_blank_h:
            # Skip rows spanning across the central blank band.
            # NOTE(review): the prints below are not guarded by `show`.
            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
                print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
                continue

            # Left side: must start between the table's x1 and the center x1.
            if table_bbox[0] - threshold <= min_w < center_blank_bbox[0]:
                match_row_list.append([lt_text_row, 0, max_h])
            # Right side: must end between the center x2 and the table's x2.
            elif center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3:
                match_row_list.append([lt_text_row, 1, max_h])
            else:
                print('center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3')
                break

            # Extend the acceptance window by one more row gap per match.
            add_blank_h = add_blank_h + mean_blank_h + threshold

    if show:
        print('add_last_rows match_row_list', match_row_list)

    add_b_table = []
    real_max_h = None
    for mi, match_row in enumerate(match_row_list):
        lt_text_row, is_right, max_h = match_row
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
        # Single text cell: split it at the first colon run into head/value.
        if len(lt_text_row) == 1:
            text = lt_text_row[0].get_text()
            match = re.search('[::]+', text)
            real_max_h = max_h
            if not match:
                head = ""
                value = text
            else:
                head = text[:match.end()]
                value = text[match.end():]
        # Or two cells that are really one head label split by whitespace
        # (second fragment ends with a colon).
        elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \
                and lt_text_row[1].get_text()[-1] in [':', ":"]:
            text = lt_text_row[0].get_text() + lt_text_row[1].get_text()
            head = text
            value = ''
        # Genuine two cells: head (must contain a colon) and value.
        elif len(lt_text_row) == 2:
            text1 = lt_text_row[0].get_text()
            match = re.search('[::]+', text1)
            if not match:
                break
            real_max_h = max_h
            head = text1
            value = lt_text_row[1].get_text()
        else:
            if show:
                print('add_last_rows len(lt_text_row) break', len(lt_text_row))
            break

        # Fetch the previous row: a head-less value may need to be folded
        # into it.
        if mi == 0 or len(add_b_table) == 0:
            last_row = b_table[-1]
            last_flag = 0
        else:
            last_row = add_b_table[-1]
            last_flag = 1

        if is_right:
            if last_row[2] and not last_row[3] and not head and value:
                b_table[-1][3] = value
                current_row = ["", "", last_row[2], value]
            else:
                current_row = ["", "", head, value]
        else:
            if last_row[0] and not last_row[1] and not head and value:
                current_row = [last_row[0], value, "", ""]
            else:
                current_row = [head, value, "", ""]

        # if last_flag == 0:
        #     b_table = b_table[:-1]
        add_b_table.append(current_row)

        if show:
            print('current_row', current_row)

    if show:
        print('add_b_table', add_b_table)

    b_table += add_b_table
    if real_max_h is not None:
        table_bbox[3] = real_max_h
    return b_table
+
+
def add_first_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                   table_lt_text_row_list, show=0):
    """Absorb text rows overlapping the TOP edge of a borderless 4-column
    table: a single-cell text straddling table_bbox[1] is split at its
    first colon run, a head-less value is prepended to the table's first
    row, and table_bbox[1] is pulled up to the matched text's top.
    Counterpart of add_last_rows; parameters have the same meaning.
    """
    if not b_table:
        return b_table
    if len(b_table[0]) not in [4]:
        return b_table

    # Estimate the average vertical gap between consecutive text rows that
    # are already inside the table.
    blank_h_list = []
    max_h_list = []
    for lt_text_row in table_lt_text_row_list:
        if not lt_text_row:
            continue
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        max_h_list.append(max_h)
    max_h_list.sort(key=lambda x: x)
    for i in range(1, len(max_h_list)):
        blank_h_list.append(max_h_list[i] - max_h_list[i - 1])
    mean_blank_h = np.mean(blank_h_list)
    if show:
        print('add_first_rows blank_width_list', blank_h_list)
        print('add_first_rows mean_blank_h', mean_blank_h)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
    match_row_list = []
    threshold = 5
    # NOTE(review): add_blank_h is computed but never used in this function
    # (unlike add_last_rows, where it widens the acceptance window).
    add_blank_h = mean_blank_h + threshold
    for li, lt_text_row in enumerate(lt_text_row_list):
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('min_h < table_bbox[3]', lt_text_row, min_h, table_bbox[3])
        # The row must straddle the table's top edge.
        if min_h <= table_bbox[1] < max_h:
            # Skip rows spanning across the central blank band.
            # NOTE(review): this print is not guarded by `show`.
            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
                print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
                continue
            # match_row_list.append([lt_text_row, 1, min_h])

            # Left of the central gap -> left head/value pair.
            if min_w < center_blank_bbox[0]:
                match_row_list.append([lt_text_row, 0, min_h])
            # Right of the central gap -> right head/value pair.
            elif center_blank_bbox[2] < max_w:
                match_row_list.append([lt_text_row, 1, min_h])
            else:
                break

    if show:
        print('add_first_rows match_row_list', match_row_list)

    real_min_h = None
    for mi, match_row in enumerate(match_row_list):
        lt_text_row, is_right, min_h = match_row
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
        # Single text cell: split it at the first colon run into head/value.
        if len(lt_text_row) == 1:
            text = lt_text_row[0].get_text()
            match = re.search('[::]+', text)
            real_min_h = min_h
            if not match:
                head = ""
                value = text
            else:
                head = text[:match.end()]
                value = text[match.end():]
        # # 或 两列,其实是表头由于空白被隔开
        # elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \
        #         and lt_text_row[1].get_text()[-1] in [':', ":"]:
        #     text = lt_text_row[0].get_text() + lt_text_row[1].get_text()
        #     head = text
        #     value = ''
        # # 两列
        # elif len(lt_text_row) == 2:
        #     text1 = lt_text_row[0].get_text()
        #     match = re.search('[::]+', text1)
        #     if not match:
        #         break
        #     real_max_h = max_h
        #     head = text1
        #     value = lt_text_row[1].get_text()
        else:
            if show:
                print('add_first_rows len(lt_text_row) break', len(lt_text_row))
            break

        # Prepend an orphan value onto the table's first row.
        if not head and value:
            if is_right:
                b_table[0][3] = value + b_table[0][3]
            else:
                b_table[0][1] = value + b_table[0][1]

    if real_min_h is not None:
        table_bbox[1] = real_min_h
    return b_table
+
+
def get_row_bbox(row, mode='list'):
    """Return the bounding box enclosing every element of *row*.

    :param row: non-empty sequence of boxes; each element is either a
        4-sequence ``[x1, y1, x2, y2]`` (``mode='list'``) or an object
        exposing a ``.bbox`` attribute of that shape (``mode='.bbox'``).
    :param mode: ``'list'`` or ``'.bbox'``.
    :return: tuple ``(min_x, min_y, max_x, max_y)``.
    :raises ValueError: if *mode* is not supported (previously an unknown
        mode surfaced as an opaque ``UnboundLocalError``).
    """
    if mode == 'list':
        boxes = row
    elif mode == '.bbox':
        boxes = [x.bbox for x in row]
    else:
        raise ValueError("mode must be 'list' or '.bbox', got %r" % (mode,))

    min_x = min(b[0] for b in boxes)
    min_y = min(b[1] for b in boxes)
    max_x = max(b[2] for b in boxes)
    max_y = max(b[3] for b in boxes)
    return min_x, min_y, max_x, max_y
+
+
def shrink_bbox(img, bbox_list):
    """Tighten each bbox in *bbox_list* around its non-background content.

    The dominant color of a down-sampled copy of *img* is taken as the
    background; inside each box the first/last rows and columns that differ
    from that color bound the new, shrunken bbox. Empty crops and boxes
    with no foreground pixels are returned unchanged.

    :param img: HxWx3 image as a numpy array.
    :param bbox_list: list of [x1, y1, x2, y2] boxes.
    :return: list of shrunken [x1, y1, x2, y2] boxes, same order.
    """
    # NOTE(review): this slow variant is currently unused — only the
    # *_fast version below is called.
    def return_not_most_color_index(image_np, match_color):
        # Per-pixel distance to the background color (note: sqrt is applied
        # per channel before summing, so this is not a true euclidean norm).
        diff = np.sum(np.sqrt((image_np.astype(np.int32) - match_color.astype(np.int32)) ** 2), axis=2)
        threshold = 100  # distance threshold; tune as needed
        diff_mask = diff > threshold
        # Indices of pixels that differ markedly from the background.
        diff_index = np.where(diff_mask)
        # print('diff_index.size', diff_index[0].size)
        return diff_index

    def return_not_most_color_index_fast(image_np, match_color):
        # Inputs are already int32 (converted once by the caller).
        # image_int = image_np.astype(np.int32)
        # match_color_int = match_color.astype(np.int32)

        # Squared euclidean distance of every pixel to the background color.
        diff = np.sum((image_np - match_color) ** 2, axis=2)
        threshold = 20 # distance threshold (compared squared); tune as needed
        threshold = threshold ** 2
        diff_mask = diff > threshold
        # Indices of pixels that differ markedly from the background.
        diff_index = np.where(diff_mask)
        # print('diff_index.size', diff_index[0].size)
        return diff_index

    # (Removed dead commented-out experiments: histogram / KMeans / bincount
    # color counting.)

    # Down-sample the image, then count colors to find the most frequent
    # one — assumed to be the page background.
    time0 = time.time()
    down_sample_factor = 8
    down_sampled_img = img[::down_sample_factor, ::down_sample_factor, :]
    down_sampled_img_color = down_sampled_img.reshape(-1, 3)
    colors, counts = np.unique(down_sampled_img_color, return_counts=True, axis=0)
    log('shrink_bbox 0 ' + str(time.time()-time0))

    # Pick the most frequent color as the background color.
    time0 = time.time()
    max_count_index = np.argmax(counts)
    most_frequent_color = colors[max_count_index]
    most_frequent_color = most_frequent_color.astype(np.int32)
    log('shrink_bbox 1 ' + str(time.time()-time0))

    new_bbox_list = []
    img_int = img.astype(np.int32)
    time0 = time.time()
    for bbox in bbox_list:
        # img_bbox = img[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
        # img_bbox = img[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]
        img_bbox_int = img_int[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]

        # Degenerate crop: keep the original bbox.
        if 0 in img_bbox_int.shape:
            new_bbox_list.append(bbox)
            continue

        # Scan rows for foreground pixels; first/last hits bound the height.
        # index_list = return_first_black_index(img_bbox[:, :, :])
        index_list = return_not_most_color_index_fast(img_bbox_int, most_frequent_color)

        if index_list[0].size == 0 or index_list[1].size == 0:
            new_bbox_list.append(bbox)
            continue
        min_h = index_list[0][0]
        max_h = index_list[0][-1]

        # Transpose and repeat the scan to bound the width.
        img_bbox1 = np.swapaxes(img_bbox_int, 0, 1)
        # index_list = return_first_black_index(img_bbox1[:, :, :])
        index_list = return_not_most_color_index_fast(img_bbox1, most_frequent_color)

        if index_list[0].size == 0 or index_list[1].size == 0:
            new_bbox_list.append(bbox)
            continue
        min_w = index_list[0][0]
        max_w = index_list[0][-1]

        # Translate crop-local bounds back to full-image coordinates.
        real_min_w = bbox[0] + min_w
        real_max_w = bbox[0] + max_w
        real_min_h = bbox[1] + min_h
        real_max_h = bbox[1] + max_h
        new_bbox = [real_min_w, real_min_h, real_max_w, real_max_h]
        new_bbox_list.append(new_bbox)

        # cv2.imshow('img', img_bbox)
        # cv2.imshow('shrink', img[int(new_bbox[0][1]):int(new_bbox[2][1]), int(new_bbox[0][0]):int(new_bbox[2][0]), :])
        # cv2.waitKey(0)
    log('shrink_bbox 2 ' + str(time.time() - time0))
    return new_bbox_list
+
+
+def shrink_bbox_by_pixel(lt_text_list):
+    for lt_text in lt_text_list:
+        bbox = lt_text.bbox
+        bbox_h = abs(bbox[3] - bbox[1])
+        shrink_h = bbox_h / 2
+        new_bbox = [bbox[0], int(bbox[1] + shrink_h / 2),
+                    bbox[2], int(bbox[3] - shrink_h / 2)
+                    ]
+        lt_text.bbox = new_bbox
+    return lt_text_list
+
+
def get_inter_part(bbox_list, show=0):
    """Axis-aligned intersection of all boxes in *bbox_list*.

    Sorts bbox_list in place by (x1, x2), then narrows an accumulator box
    to the region common to every input box. The result's coordinates are
    re-ordered so x1 <= x2 and y1 <= y2 even when the boxes are disjoint.

    :param bbox_list: list of [x1, y1, x2, y2] boxes.
    :return: [x1, y1, x2, y2] of the common region, or None if empty input.
    """
    if not bbox_list:
        return None

    bbox_list.sort(key=lambda b: (b[0], b[2]))

    left, top, right, bottom = bbox_list[0]
    for b in bbox_list:
        left = max(left, b[0])
        top = max(top, b[1])
        right = min(right, b[2])
        bottom = min(bottom, b[3])

    # Normalize ordering (disjoint inputs can invert the bounds).
    result = [min(left, right), min(top, bottom),
              max(left, right), max(top, bottom)]
    if show:
        print('get_inter_part', result)
    return result
+
+
def get_inter_part_250530(bbox_list, show=0):
    """Unfinished draft (dated 2025-05-30) of get_inter_part.

    NOTE(review): this stub sorts the coordinate lists but never combines
    them or returns a value — every non-empty input falls through and
    returns None, the same as an empty input. Kept for reference only.
    """
    if not bbox_list:
        return None

    x1_list = [x[0] for x in bbox_list]
    x2_list = [x[2] for x in bbox_list]
    y1_list = [x[1] for x in bbox_list]
    y2_list = [x[3] for x in bbox_list]

    x1_list.sort(key=lambda x: x, reverse=True)
    x2_list.sort(key=lambda x: x)
+
+
def get_straight_lines_from_image(image_np, threshold=50):
    """Debug helper: detect straight line segments with Canny edges plus the
    probabilistic Hough transform, draw them in red and show the result in
    blocking OpenCV windows.

    :param image_np: BGR image as a numpy array, or None.
    :param threshold: Hough accumulator vote threshold.
    :return: False when image_np is None; otherwise None (implicit).

    NOTE(review): cv2.HoughLinesP returns None when no line is found, which
    would raise a TypeError in the loop below — interactive use only.
    """
    if image_np is None:
        print("无法读取图像")
        return False

    # Grayscale conversion for edge detection.
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)

    # Canny edge map (low threshold 20, high threshold 150).
    edges = cv2.Canny(gray, 20, 150)

    cv2.imshow('edges', edges)

    # Probabilistic Hough transform on the edge map.
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold,
                            minLineLength=50, maxLineGap=2)

    for line in lines:
        line = line[0]
        print('line', line)
        cv2.line(image_np, line[:2], line[2:], (0, 0, 255))

    cv2.imshow('img', image_np)
    cv2.waitKey(0)

    print('lines', lines)
+
+
def get_table_bbox(table):
    """Bounding box [x1, y1, x2, y2] covering every cell bbox in *table*,
    a 2-D list of objects exposing a .bbox attribute."""
    cells = [cell for row in table for cell in row]
    return [min(c.bbox[0] for c in cells),
            min(c.bbox[1] for c in cells),
            max(c.bbox[2] for c in cells),
            max(c.bbox[3] for c in cells)]
+
+
@memory_decorator
def merge_intersecting_lists(lists):
    """Union-merge lists that share at least one element.

    Each input list is folded into the FIRST already-merged group it
    intersects. NOTE(review): this is a single pass, so merging is not
    transitive — a later list bridging two previously-disjoint groups joins
    only the first; element order inside merged groups is not preserved
    (set union).
    """
    merged_lists = []
    for current_list in lists:
        # Work with a set for cheap intersection tests.
        current_set = set(current_list)
        merged = False
        # Look for an existing merged group sharing any element.
        for i in range(len(merged_lists)):
            merged_set = set(merged_lists[i])
            # Overlap found: union into that group and stop searching.
            if current_set & merged_set:
                merged_lists[i] = list(merged_set.union(current_set))
                merged = True
                break
        # No overlap with any existing group: start a new one.
        if not merged:
            merged_lists.append(current_list.copy())
    return merged_lists
+
+
def merge_same_bbox(lt_text_list, avg_char_width, show=0):
    """Merge a short (<=2 chars), colon-less text box with a nearby box to
    its right containing a colon — typically one head label broken in two
    by whitespace. Both list slots are replaced with the merged TextBox,
    then the list is de-duplicated (via set()) and re-sorted by (x1, y1).
    """
    from format_convert.convert_tree import TextBox
    for i in range(len(lt_text_list)):
        lt_text1 = lt_text_list[i]
        line1_x = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
        line1_y = ((lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0))

        for j in range(i+1, len(lt_text_list)):
            lt_text2 = lt_text_list[j]
            # if lt_text1 == lt_text2:
            #     continue
            # Candidate must start strictly to the right of box1's end.
            if lt_text1.bbox[2] >= lt_text2.bbox[0]:
                continue

            # Must not overlap on the x axis.
            line2_x = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
            if line_iou(line1_x, line2_x) > 0:
                continue

            # Strong y overlap, horizontally close (< 5 avg char widths),
            # right part has a colon, left part is a short colon-less
            # fragment: treat the pair as one split head label.
            line2_y = ((lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0))
            if line_iou(line1_y, line2_y) > 0.9 \
                    and abs(lt_text1.bbox[2] - lt_text2.bbox[0]) < avg_char_width * 5 \
                    and re.search('[::]', lt_text2.get_text()) \
                    and not re.search('[::]', lt_text1.get_text()) \
                    and len(lt_text1.get_text()) <= 2:
                new_lt_text = TextBox(text=lt_text1.get_text() + lt_text2.get_text(),
                                      bbox=[lt_text1.bbox[0], min(lt_text1.bbox[1], lt_text2.bbox[1]),
                                            lt_text2.bbox[2], max(lt_text1.bbox[3], lt_text2.bbox[3])
                                            ])
                lt_text_list[i] = new_lt_text
                lt_text_list[j] = new_lt_text
                if show:
                    print('new_lt_text', new_lt_text)

    lt_text_list = list(set(lt_text_list))
    lt_text_list.sort(key=lambda x: (x.bbox[0], x.bbox[1]))

    return lt_text_list
+
+
def sort_by_read_order(lt_text_list, threshold=10):
    """Order text boxes in natural reading order.

    Boxes whose y1 values differ by less than *threshold* (comparing each
    box to its predecessor after a y1 sort) form one visual row; each row
    is then sorted left-to-right by x1. Sorts the input list in place by
    y1 and returns a new, fully ordered list.
    """
    if not lt_text_list:
        return lt_text_list

    lt_text_list.sort(key=lambda box: box.bbox[1])

    ordered = []
    row = [lt_text_list[0]]
    for prev, cur in zip(lt_text_list, lt_text_list[1:]):
        if abs(cur.bbox[1] - prev.bbox[1]) < threshold:
            row.append(cur)
        else:
            # Row break: flush the finished row left-to-right.
            ordered.extend(sorted(row, key=lambda box: box.bbox[0]))
            row = [cur]
    ordered.extend(sorted(row, key=lambda box: box.bbox[0]))
    return ordered
+
+
def delete_empty_bbox(lt_text_list, show=0):
    """Drop text boxes whose text is a lone colon/semicolon or whitespace
    only; returns a new filtered list."""
    punctuation_only = (':', ":", ";", ";")
    return [box for box in lt_text_list
            if box.get_text() not in punctuation_only
            and re.sub(r'\s', '', box.get_text()) != ""]
+
+
def standard_table(table, show=0):
    """Normalize a 4-column head/value table whose cells are dicts with a
    'text' key: strip the '@@:' placeholder, move colon-less heads parked
    in the value column back to the head column, merge head rows with the
    value rows below them, and fold continuation values into the row above.
    Cell dicts are mutated in place; a (possibly shorter) table is returned.
    """
    if not table:
        return table

    # Strip the placeholder token from every cell.
    for row in table:
        for cell in row:
            if '@@:' in cell.get('text'):
                cell['text'] = re.sub('@@:', '', cell.get('text'))

    # A head whose colon OCR missed lands in the value column: shift it back.
    for ri, row in enumerate(table):
        if row[0].get('text') == '' and row[1].get('text') != '' and row[2].get('text') != '' and row[3].get('text') == '':
            row[0]['text'] = row[1].get('text')
            row[1]['text'] = ''
            if show:
                print('standard_table, add colon head', table[ri])

    # Heads on one line, values on the next:
    # head          head
    #       value           value
    drop = []
    for ri in range(1, len(table)):
        prev, cur = table[ri - 1], table[ri]
        if prev[0].get('text') != '' and prev[1].get('text') == '' \
                and cur[0].get('text') == '' and cur[1].get('text') != '' \
                and prev[2].get('text') != '' and prev[3].get('text') == '' \
                and cur[2].get('text') == '' and cur[3].get('text') != '':
            # Copy the heads down, then drop the now-redundant head row.
            cur[0]['text'] = prev[0].get('text')
            cur[2]['text'] = prev[2].get('text')
            drop.append(ri - 1)
            if show:
                print('standard_table, fix head value 1', table[ri])
    table = [row for ri, row in enumerate(table) if ri not in drop]

    # Continuation values that belong to the row above:
    # head  value   head    value
    #       value           value
    drop = []
    for ri in range(1, len(table)):
        prev, cur = table[ri - 1], table[ri]
        if prev[0].get('text') != '' and prev[1].get('text') != '' \
                and cur[0].get('text') == '' and cur[1].get('text') != '' \
                and prev[2].get('text') != '' and prev[3].get('text') != '' \
                and cur[2].get('text') == '' and cur[3].get('text') != '':
            prev[1]['text'] += cur[1]['text']
            prev[3]['text'] += cur[3]['text']
            drop.append(ri)
    table = [row for ri, row in enumerate(table) if ri not in drop]
    return table
+
+
@memory_decorator
def find_outline_lt_text(lt_text_list, show=0):
    """Return text boxes that sit alone on their visual line, i.e. no other
    box overlaps them on the y axis — text lying outside any multi-column
    row. Sorts lt_text_list in place by (y1, x1).

    NOTE(review): O(n^2) pairwise scan with list-based membership checks;
    rows are grown greedily from the first unused box.
    """
    lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
    used_lt_text_list = []
    row_list = []
    for lt_text1 in lt_text_list:
        if lt_text1 in used_lt_text_list:
            continue
        row = [lt_text1]
        used_lt_text_list.append(lt_text1)
        for lt_text2 in lt_text_list:
            if lt_text2 in used_lt_text_list:
                continue
            line1 = [(lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0)]
            line2 = [(lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0)]
            # Any y overlap puts the two boxes on the same visual line.
            if line_iou(line1, line2) > 0:
                row.append(lt_text2)
                used_lt_text_list.append(lt_text2)
        row_list.append(row)

    # Keep only single-box rows.
    outline_lt_text_list = []
    for row in row_list:
        if len(row) >= 2:
            continue
        outline_lt_text_list += row

    if show:
        print('outline_lt_text_list', outline_lt_text_list)
    return outline_lt_text_list
+
+
def get_iou(bbox1, bbox2):
    """Intersection-over-union of two inclusive pixel boxes (x1, y1, x2, y2).

    Full containment in either direction is treated as a perfect match and
    returns 1.0; otherwise the standard IoU is computed with +1 inclusive
    widths/heights. Returns 0 for disjoint boxes.
    """
    ax1, ay1, ax2, ay2 = bbox1
    bx1, by1, bx2, by2 = bbox2

    # Containment check (either box inside the other).
    a_contains_b = ax1 <= bx1 and ay1 <= by1 and ax2 >= bx2 and ay2 >= by2
    b_contains_a = bx1 <= ax1 and by1 <= ay1 and bx2 >= ax2 and by2 >= ay2
    if a_contains_b or b_contains_a:
        return 1.0

    # Inclusive overlap extents, clamped to zero when the boxes miss.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1) + 1)
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1) + 1)
    overlap = overlap_w * overlap_h

    area_a = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    area_b = (bx2 - bx1 + 1) * (by2 - by1 + 1)
    union = area_a + area_b - overlap

    return overlap / union if union != 0 else 0
+
+
def fix_cross_bbox(lt_text_list, show=0):
    """Separate slightly-overlapping text boxes in place.

    For each intersecting pair (per the sibling get_iou helper), a small
    overlap on the right or bottom edge is resolved by clipping both boxes
    at the shared band. Overlaps larger than half of either extent are
    assumed to belong to the other axis and left untouched.

    NOTE(review): O(n^2) over ordered pairs, so every pair is examined
    twice; the second visit is a no-op once the boxes no longer intersect.
    """
    for lt_text1 in lt_text_list:
        for lt_text2 in lt_text_list:
            if lt_text1 == lt_text2:
                continue
            if get_iou(lt_text1.bbox, lt_text2.bbox) > 0:
                if show:
                    print('fix_cross_bbox1', lt_text1, lt_text2)
                x10, x11, x12, x13 = lt_text1.bbox
                x20, x21, x22, x23 = lt_text2.bbox

                # Overlap on the right edge; the shared width must be small,
                # otherwise the pair really intersects on the other axis.
                if x10 < x20 < x12 and x12 - x20 < max(abs(x12 - x10), abs(x20 - x22)) / 2:
                    x12 = min(lt_text1.bbox[2], lt_text2.bbox[0])
                    x20 = max(lt_text1.bbox[2], lt_text2.bbox[0])

                # Overlap on the bottom edge; same smallness constraint.
                if x11 < x21 < x13 and x13 - x21 < max(abs(x13 - x11), abs(x21 - x23)) / 2:
                    x13 = min(lt_text1.bbox[3], lt_text2.bbox[1])
                    x21 = max(lt_text1.bbox[3], lt_text2.bbox[1])

                lt_text1.bbox = [x10, x11, x12, x13]
                lt_text2.bbox = [x20, x21, x22, x23]
                if show:
                    print('fix_cross_bbox2', lt_text1, lt_text2)
    return lt_text_list
+
+
+def split_lt_text_by_many_space(lt_text_list, show=0):
+    from format_convert.convert_tree import TextBox
+
+    # 先处理前后空格
+    add_lt_text_list = []
+    delete_lt_text_list = []
+    for lt_text in lt_text_list:
+        text = lt_text.get_text()
+        bbox = lt_text.bbox
+
+        if len(text) == 0:
+            continue
+        text_unicode_len = get_char_unicode_length(text)
+        if text_unicode_len == 0:
+            continue
+        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len
+
+        space1 = re.findall('^[  ]+', text)
+        if space1:
+            space1 = ''.join(space1)
+            space1_unicode_len = get_char_unicode_length(space1)
+            space1_pixel_len = space1_unicode_len * ratio
+            text = re.sub('^[  ]+', '', text)
+            bbox = [bbox[0] + space1_pixel_len, bbox[1], bbox[2], bbox[3]]
+            if len(text) == 0:
+                continue
+            text_unicode_len = get_char_unicode_length(text)
+            if text_unicode_len == 0:
+                continue
+            ratio = abs(bbox[2] - bbox[0]) / text_unicode_len
+
+        space2 = re.findall('[  ]+$', text)
+        if space2:
+            space2 = ''.join(space2)
+            space2_unicode_len = get_char_unicode_length(space2)
+            space2_pixel_len = space2_unicode_len * ratio
+            text = re.sub('[  ]+$', '', text)
+            bbox = [bbox[0], bbox[1], bbox[2] - space2_pixel_len, bbox[3]]
+            if len(text) == 0:
+                continue
+            text_unicode_len = get_char_unicode_length(text)
+            if text_unicode_len == 0:
+                continue
+            ratio = abs(bbox[2] - bbox[0]) / text_unicode_len
+
+        if space1 or space2:
+            new_lt_text = TextBox(text=text, bbox=bbox)
+            add_lt_text_list.append(new_lt_text)
+            delete_lt_text_list.append(lt_text)
+
+    for lt_text in delete_lt_text_list:
+        if lt_text in lt_text_list:
+            lt_text_list.remove(lt_text)
+    lt_text_list += add_lt_text_list
+
+    # 处理表头中间隔着几个空格 电  话:        电  话:
+    add_lt_text_list = []
+    delete_lt_text_list = []
+    for lt_text in lt_text_list:
+        text = lt_text.get_text()
+        bbox = lt_text.bbox
+
+        if len(text) == 0:
+            continue
+
+        space_list = re.findall('[  ]+', text)
+        if len(space_list) >= 2:
+            space_list.sort(key=lambda x: len(x))
+            max_space = space_list[-1]
+            match = re.search(max_space, text)
+            if show:
+                print('max_space', max_space)
+                print('space_list', space_list)
+            if match:
+                part1 = text[:match.start()]
+                part2 = text[match.end():]
+                ss1 = re.split('[  ]+', part1)
+                ss2 = re.split('[  ]+', part2)
+
+                if len(ss1) == 2 and len(ss1[0]) == 1 and len(ss1[1]) == 2 and ss1[1][-1] in [':', ':'] \
+                        and len(ss2) == 2 and len(ss2[0]) == 1 and len(ss2[1]) == 2 and ss2[1][-1] in [':', ':']:
+                    new_text = ''.join(ss1) + max_space + ''.join(ss2)
+                    new_lt_text = TextBox(text=new_text, bbox=bbox)
+                    add_lt_text_list.append(new_lt_text)
+                    delete_lt_text_list.append(lt_text)
+
+    if show:
+        print('split_lt_text_by_many_space add_lt_text_list222', add_lt_text_list)
+        print('split_lt_text_by_many_space delete_lt_text_list222', delete_lt_text_list)
+
+    for lt_text in delete_lt_text_list:
+        if lt_text in lt_text_list:
+            lt_text_list.remove(lt_text)
+    lt_text_list += add_lt_text_list
+
+    # 处理中间多个空格,并拆分为两个
+    add_lt_text_list = []
+    delete_lt_text_list = []
+    for lt_text in lt_text_list:
+        text = lt_text.get_text()
+        bbox = lt_text.bbox
+
+        if len(text) == 0:
+            continue
+
+        text_unicode_len = get_char_unicode_length(text)
+        if text_unicode_len == 0:
+            continue
+        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len
+
+        # 中间有多个空格,且空格分割为两部分
+        match = re.search('[  ]{4,}', text)
+        ss = re.split('[  ]+', text)
+        if match and len(ss) == 2:
+            # if match:
+            part1 = text[:match.start()]
+            part2 = text[match.end():]
+
+            l1 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part1)
+            l2 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part2)
+            # 两边字符数都足够
+            if len(l1) >= 2 and len(l2) >= 2:
+                part1_unicode_len = get_char_unicode_length(part1)
+                part2_unicode_len = get_char_unicode_length(part2)
+
+                part1_pixel_len = ratio * part1_unicode_len
+                part2_pixel_len = ratio * part2_unicode_len
+
+                # avg_char_w = abs(bbox[0] - bbox[2]) / len(text)
+                bbox1 = [bbox[0], bbox[1], bbox[0] + part1_pixel_len, bbox[3]]
+                bbox2 = [bbox[2] - part2_pixel_len, bbox[1], bbox[2], bbox[3]]
+                # 用自己的对象新增
+                new_lt_text1 = TextBox(text=part1, bbox=bbox1)
+                new_lt_text2 = TextBox(text=part2, bbox=bbox2)
+                add_lt_text_list += [new_lt_text1, new_lt_text2]
+                delete_lt_text_list.append(lt_text)
+
+    for lt_text in delete_lt_text_list:
+        if lt_text in lt_text_list:
+            lt_text_list.remove(lt_text)
+    lt_text_list += add_lt_text_list
+
+    if show:
+        print('split_lt_text_by_many_space add_lt_text_list333', add_lt_text_list)
+        print('split_lt_text_by_many_space delete_lt_text_list333', delete_lt_text_list)
+
+    return lt_text_list
+
+
def get_char_unicode_length(text, show=0):
    """
    Return the display width of *text* in terminal cells.

    Full-width characters (CJK ideographs, full-width punctuation, ideographic
    space) count as 2 cells and regular ASCII as 1, so the result approximates
    the rendered width of the string. Callers divide a bbox pixel width by this
    value to get a pixels-per-cell ratio.

    :param text: string to measure
    :param show: when truthy, print the text and its computed width (debug)
    :return: int display width of the string

    NOTE(review): wcwidth.wcswidth returns -1 when *text* contains a
    non-printable character, but callers only guard against a result of 0,
    which would make the pixel ratio negative — confirm inputs are printable.
    """
    width = wcwidth.wcswidth(text)
    if show:
        print('text unicode_length', text, width)
    return width
+
+
def fix_final_row(table, show=0):
    """
    Merge a dangling final row of a 4-column table into the row above it.

    Table extraction sometimes emits a last row in which only one cell
    (column index 1 or 3) carries text while the other three hold empty
    placeholders ('' or the '@@:' marker). Such a row is really the
    continuation of the previous row: its single value is copied up into
    the same column of the second-to-last row and the final row is dropped.

    :param table: list of rows, each a list of >= 4 cell strings;
                  the second-to-last row is mutated in place on merge
    :param show: when truthy, print debug info about the merge
    :return: the (possibly shortened) table
    """
    if len(table) < 2:
        return table
    last_row = table[-2]
    final_row = table[-1]
    # Guard against malformed rows without 4 columns (would raise IndexError)
    if len(last_row) < 4 or len(final_row) < 4:
        return table
    # Debug prints are gated by `show`, consistent with the rest of the file
    if show:
        print('final_row', final_row)
        print('last_row', last_row)

    empty_values = ['', '@@:']
    delete_final_flag = 0
    # Only column 3 holds text -> merge it rightward into the row above
    if final_row[0] in empty_values and final_row[1] in empty_values \
            and final_row[2] in empty_values and final_row[3] not in empty_values:
        table[-2][3] = final_row[3]
        delete_final_flag = 1
        if show:
            print('fix_final_row right', table[-2])

    # Only column 1 holds text -> merge it leftward into the row above
    if final_row[0] in empty_values and final_row[1] not in empty_values \
            and final_row[2] in empty_values and final_row[3] in empty_values:
        table[-2][1] = final_row[1]
        delete_final_flag = 1
        if show:
            print('fix_final_row left', table[-2])

    if delete_final_flag:
        table = table[:-1]

    return table
+
+
if __name__ == '__main__':
    # Debug harness: run the straight-line detector on one saved table image.
    #
    # Batch variant kept for reference:
    # from format_convert.convert_pdf import PDFConvert
    # pdf_c = PDFConvert(None, None, None)
    # from format_convert.convert_image import ImageProcess
    # img_p = ImageProcess(None, None)
    #
    # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_not_detect\*')
    # image_np_list = [[x, cv2.imread(x)] for x in ps]
    # for p, image_np in image_np_list:
    #     # limit overall resolution
    #     image_np = img_p.resize_process(image_np)
    #     # OCR the image
    #     text_list, box_list = img_p.ocr_process(image_np)
    #     # convert to lt_text_box objects
    #     _lt_text_list = text_bbox_to_lt(text_list, box_list)
    #     # pre-judge from the bboxes whether a borderless table may exist
    #     _flag = judge_has_b_table_by_bbox(_lt_text_list, [], 0)
    #     print('path', p, 'has b table', _flag)

    _image_path = r'D:\Project\format_conversion_maxcompute\save_b_table\15-8292f767be81f404b813c119058a8a75.png'
    _image = cv2.imread(_image_path)
    _image = pil_resize(_image, 1024, 768)
    get_straight_lines_from_image(_image)

+ 5 - 0
botr/utils.py

@@ -38,6 +38,11 @@ def request_post(url, param, time_out=1000, use_zlib=False):
 
 
 
 
 def line_iou(line1, line2, axis=0):
 def line_iou(line1, line2, axis=0):
+    if line1[0][axis] <= line2[0][axis] <= line2[1][axis] <= line1[1][axis]:
+        return 1.
+    if line2[0][axis] <= line1[0][axis] <= line1[1][axis] <= line2[1][axis]:
+        return 1.
+
     inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
     inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
     # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
     # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
     union = min(abs(line1[0][axis]-line1[1][axis]), abs(line2[0][axis]-line2[1][axis]))
     union = min(abs(line1[0][axis]-line1[1][axis]), abs(line2[0][axis]-line2[1][axis]))

+ 1 - 1
config/interface_new.yml

@@ -58,7 +58,7 @@
 
 
     "tika": {
     "tika": {
       "port": [ 16020 ],
       "port": [ 16020 ],
-      "port_num": [ 2 ],
+      "port_num": [ 1 ],
       "gpu": [ -1 ]
       "gpu": [ -1 ]
     }
     }
   },
   },

+ 287 - 118
format_convert/convert.py

@@ -1,4 +1,4 @@
-#-*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 import gc
 import gc
 import json
 import json
 import sys
 import sys
@@ -6,8 +6,20 @@ import os
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 # 强制tf使用cpu
 # 强制tf使用cpu
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+# 动态添加 VERSION 属性到 Image 类
+import PIL
+from PIL import Image
+Image.VERSION = PIL.__version__
+
 from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
 from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
-    set_flask_global, get_md5_from_bytes, memory_decorator
+    set_flask_global, get_md5_from_bytes, memory_decorator, register_all_fonts
+
+# 调用函数注册字体
+# register_all_fonts("/usr/share/fonts/opentype/noto/")
+# register_all_fonts("/usr/share/fonts/truetype/arphic")
+# register_all_fonts("/usr/share/fonts/")
+
 from format_convert.convert_doc import doc2text, DocConvert
 from format_convert.convert_doc import doc2text, DocConvert
 from format_convert.convert_docx import docx2text, DocxConvert
 from format_convert.convert_docx import docx2text, DocxConvert
 from format_convert.convert_image import picture2text, ImageConvert
 from format_convert.convert_image import picture2text, ImageConvert
@@ -18,6 +30,8 @@ from format_convert.convert_txt import txt2text, TxtConvert
 from format_convert.convert_xls import xls2text, XlsConvert
 from format_convert.convert_xls import xls2text, XlsConvert
 from format_convert.convert_xlsx import xlsx2text, XlsxConvert
 from format_convert.convert_xlsx import xlsx2text, XlsxConvert
 from format_convert.convert_zip import zip2text, ZipConvert
 from format_convert.convert_zip import zip2text, ZipConvert
+from format_convert.convert_wps import WpsConvert
+from format_convert.convert_ofd import OfdConvert
 from format_convert.convert_need_interface import from_atc_interface
 from format_convert.convert_need_interface import from_atc_interface
 
 
 import hashlib
 import hashlib
@@ -33,12 +47,28 @@ import logging
 from bs4 import BeautifulSoup
 from bs4 import BeautifulSoup
 from flask import Flask, request, g
 from flask import Flask, request, g
 import inspect
 import inspect
+
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 from format_convert.table_correct import *
 from format_convert.table_correct import *
 from format_convert.wrapt_timeout_decorator import *
 from format_convert.wrapt_timeout_decorator import *
 from format_convert import _global
 from format_convert import _global
 from config.max_compute_config import MAX_COMPUTE
 from config.max_compute_config import MAX_COMPUTE
 
 
+support_file_types = [
+    'txt',
+    'pdf',
+    'doc',
+    'docx',
+    'xls',
+    'xlsx',
+    'zip',
+    'rar',
+    'jpg',
+    'png',
+    'jpeg',
+    'swf',
+    'wps',
+]
 
 
 if get_platform() == "Windows":
 if get_platform() == "Windows":
     globals().update({"time_out": 1000})
     globals().update({"time_out": 1000})
@@ -64,6 +94,9 @@ def getText(_type, path_or_stream, _page_no=None, time_out=300):
     except:
     except:
         unique_type_dir = path_or_stream + "_" + _type + os.sep
         unique_type_dir = path_or_stream + "_" + _type + os.sep
 
 
+    if not os.path.exists(unique_type_dir):
+        os.mkdir(unique_type_dir)
+
     if _type == "pdf":
     if _type == "pdf":
         if MAX_COMPUTE:
         if MAX_COMPUTE:
             return PDFConvert(path_or_stream, unique_type_dir, _page_no).get_html()
             return PDFConvert(path_or_stream, unique_type_dir, _page_no).get_html()
@@ -102,11 +135,19 @@ def getText(_type, path_or_stream, _page_no=None, time_out=300):
         if MAX_COMPUTE:
         if MAX_COMPUTE:
             return TxtConvert(path_or_stream, unique_type_dir).get_html()
             return TxtConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
         return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
+    if _type == "wps":
+        if MAX_COMPUTE:
+            return WpsConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(WpsConvert(path_or_stream, unique_type_dir))
+    if _type == "ofd":
+        if MAX_COMPUTE:
+            return OfdConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(OfdConvert(path_or_stream, unique_type_dir))
     return [""]
     return [""]
 
 
 
 
 def to_html(path, text):
 def to_html(path, text):
-    with open(path, 'w',encoding="utf8") as f:
+    with open(path, 'w', encoding="utf8") as f:
         f.write("<!DOCTYPE HTML>")
         f.write("<!DOCTYPE HTML>")
         f.write('<head><meta charset="UTF-8"></head>')
         f.write('<head><meta charset="UTF-8"></head>')
         f.write("<body>")
         f.write("<body>")
@@ -154,6 +195,11 @@ def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300, save_m
     if get_platform() == "Windows":
     if get_platform() == "Windows":
         _global._init()
         _global._init()
 
 
+    if MAX_COMPUTE:
+        _path = "/home/admin"
+    else:
+        _path = os.path.dirname(os.path.abspath(__file__))
+
     globals().update({"md5": _md5})
     globals().update({"md5": _md5})
     _global.update({"md5": _md5})
     _global.update({"md5": _md5})
     log("into unique_temp_file_process")
     log("into unique_temp_file_process")
@@ -247,7 +293,7 @@ def cut_str(text_list, only_text_list, max_bytes_length=2000000):
             return only_text_list
             return only_text_list
 
 
         # 截取字符
         # 截取字符
-        all_text = all_text[:int(max_bytes_length/3)]
+        all_text = all_text[:int(max_bytes_length / 3)]
         return [all_text]
         return [all_text]
     except Exception as e:
     except Exception as e:
         log("cut_str " + str(e))
         log("cut_str " + str(e))
@@ -336,7 +382,7 @@ def convert_maxcompute(data, ocr_model, otr_model):
             print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
         else:
         else:
             print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
             print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
-                  "is_success": 1}, time.time() - start_time)
+                                        "is_success": 1}, time.time() - start_time)
         return {"result_html": text, "result_text": only_text, "is_success": 1}
         return {"result_html": text, "result_text": only_text, "is_success": 1}
     except Exception as e:
     except Exception as e:
         print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
         print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
@@ -350,6 +396,20 @@ app = Flask(__name__)
 
 
 @app.route('/convert', methods=['POST'])
 @app.route('/convert', methods=['POST'])
 def _convert():
 def _convert():
+    try:
+        data = request.form
+    except Exception:
+        log_convert_result("1" + "0" * 15, [-1], "", 0,
+                           None, None, time.time())
+        traceback.print_exc()
+        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
+                           "is_success": 0, "swf_images": str([]),
+                           "classification": ""})
+    result = convert(data)
+    return result
+
+
+def _convert_old_250613():
     """
     """
     接口返回值:
     接口返回值:
     {[str], 1}: 处理成功
     {[str], 1}: 处理成功
@@ -377,11 +437,11 @@ def _convert():
     # snapshot = tracemalloc.take_snapshot()
     # snapshot = tracemalloc.take_snapshot()
 
 
     _global._init()
     _global._init()
-    _global.update({"md5": "1"+"0"*15})
+    _global.update({"md5": "1" + "0" * 15})
     set_flask_global()
     set_flask_global()
     # _global.update({"port": str(port)})
     # _global.update({"port": str(port)})
 
 
-    log("into convert")
+    log("into _convert")
     start_time = time.time()
     start_time = time.time()
     _md5 = _global.get("md5")
     _md5 = _global.get("md5")
     _type = None
     _type = None
@@ -395,12 +455,12 @@ def _convert():
         file_path = data.get("file_path")
         file_path = data.get("file_path")
         if file_path is None:
         if file_path is None:
             stream = base64.b64decode(data.get("file"))
             stream = base64.b64decode(data.get("file"))
-            log("get bytes from file " + str(time.time()-_time))
+            log("get bytes from file " + str(time.time() - _time))
         # 有路径则直接取路径打开文件
         # 有路径则直接取路径打开文件
         else:
         else:
             with open(file_path, "rb") as f:
             with open(file_path, "rb") as f:
                 stream = f.read()
                 stream = f.read()
-            log("get bytes from file_path " + str(time.time()-_time))
+            log("get bytes from file_path " + str(time.time() - _time))
         _type = data.get("type")
         _type = data.get("type")
         _md5 = get_md5_from_bytes(stream)
         _md5 = get_md5_from_bytes(stream)
         _md5 = _md5[0]
         _md5 = _md5[0]
@@ -427,7 +487,8 @@ def _convert():
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             try:
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'), save_middle=save_middle)
             except TimeoutError:
             except TimeoutError:
                 log("convert time out! 300 sec")
                 log("convert time out! 300 sec")
                 text = [-5]
                 text = [-5]
@@ -435,7 +496,8 @@ def _convert():
         else:
         else:
             # Linux 通过装饰器设置整个转换超时时间
             # Linux 通过装饰器设置整个转换超时时间
             try:
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'), save_middle=save_middle)
             except TimeoutError:
             except TimeoutError:
                 log("convert time out! 300 sec")
                 log("convert time out! 300 sec")
                 text = [-5]
                 text = [-5]
@@ -447,11 +509,12 @@ def _convert():
                 is_success = 1
                 is_success = 1
             else:
             else:
                 is_success = 0
                 is_success = 0
-            log("md5: " + str(_md5)
-                         + " finished result: " + str(text)
-                         + " is_success: " + str(is_success) + " "
-                         + str(_type) + " "
-                         + " " + str(time.time() - start_time))
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(text) + " "
+                + "is_success: " + str(is_success) + " "
+                + str(_type) + " "
+                + 'None '
+                + str(round(time.time() - start_time, 2)))
             return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
             return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                                "is_success": is_success, "swf_images": str(swf_images)})
                                "is_success": is_success, "swf_images": str(swf_images)})
 
 
@@ -484,16 +547,17 @@ def _convert():
         if only_text[0] == '' and len(only_text) <= 1:
         if only_text[0] == '' and len(only_text) <= 1:
             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             log("md5: " + str(_md5) + " "
             log("md5: " + str(_md5) + " "
-                + " finished result: ['', 0] is_success: 1 "
+                + "finished result: ['', 0] is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + 'None '
+                + str(round(time.time() - start_time, 2)))
         else:
         else:
-            log("md5: " + str(_md5) +
-                " finished result: " + str(only_text)[:20] + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(only_text)[:20] + " "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
                 + str(classification) + " "
                 + str(classification) + " "
-                + str(time.time() - start_time))
+                + str(round(time.time() - start_time, 2)))
 
 
         # log("growth end" + str(objgraph.growth()))
         # log("growth end" + str(objgraph.growth()))
         # log("most_common_types end" + str(objgraph.most_common_types(20)))
         # log("most_common_types end" + str(objgraph.most_common_types(20)))
@@ -502,15 +566,24 @@ def _convert():
                            "classification": classification})
                            "classification": classification})
 
 
     except ConnectionError:
     except ConnectionError:
-        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
-            + str(time.time() - start_time))
+        # log("convert post has no data!" + " failed result: [-2] is_success: 0 "
+        #     + str(round(time.time() - start_time, 2)))
+        log("md5: " + str(_md5) + " "
+            + "failed result: [-2] is_success: 0 "
+            + str(_type) + " "
+            + "None "
+            + str(round(time.time() - start_time, 2))
+            )
         return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
         return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                            "is_success": 0, "swf_images": str([]),
                            "is_success": 0, "swf_images": str([]),
                            "classification": ""})
                            "classification": ""})
     except Exception as e:
     except Exception as e:
-        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
-            + str(_type) + " " +
-            str(time.time() - start_time))
+        log("md5: " + str(_md5) + " "
+            + "failed result: [-1] is_success: 0 "
+            + str(_type) + " "
+            + "None "
+            + str(round(time.time() - start_time, 2))
+            )
         traceback.print_exc()
         traceback.print_exc()
         return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
         return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                            "is_success": 0, "swf_images": str([]),
                            "is_success": 0, "swf_images": str([]),
@@ -545,6 +618,146 @@ def _convert():
 
 
 
 
 def convert(data):
 def convert(data):
+    """
+    接口返回值:
+    :return: {"result_html": [str], "result_text": [str],
+              "is_success": int, "swf_images": str(list)}
+    """
+    log("into convert")
+    start_time = time.time()
+
+    # 初始化
+    _global._init()
+    _global.update({"md5": "1" + "0" * 15})
+    set_flask_global()
+    # 文件md5
+    _md5 = _global.get("md5")
+    # 文件类型
+    _type = None
+    try:
+        if not data:
+            log("convert no data!")
+            raise ConnectionError
+
+        file_path = data.get("file_path")
+        if file_path is None:
+            stream = base64.b64decode(data.get("file"))
+            log("get bytes from file " + str(time.time() - start_time))
+        # 有路径则直接取路径打开文件
+        else:
+            with open(file_path, "rb") as f:
+                stream = f.read()
+            log("get bytes from file_path " + str(time.time() - start_time))
+
+        # 获取真实值
+        _type = data.get("type")
+        _md5 = get_md5_from_bytes(stream)
+        _md5 = _md5[0]
+        _global.update({"md5": _md5})
+
+        # 指定页码范围
+        _page_no = data.get('page_no')
+
+        # 指定timeout
+        _timeout = data.get('timeout')
+        if _timeout is not None:
+            globals().update({"time_out": _timeout})
+
+        # 是否保留中间文件
+        save_middle = data.get('save_middle')
+
+        # 最终结果截取的最大字节数
+        max_bytes = data.get("max_bytes")
+
+        # 开始转换,并且控制时间
+        try:
+            text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                        time_out=globals().get('time_out'), save_middle=save_middle)
+        except TimeoutError:
+            log("convert time out! 300 sec")
+            text = [-5]
+            swf_images = []
+
+        # 报错依然成功的
+        still_success_code = [-3, -4, -7]
+        if judge_error_code(text):
+            if judge_error_code(text, still_success_code):
+                is_success = 1
+            else:
+                is_success = 0
+            log_convert_result(_md5, text, "", is_success,
+                               _type, None, start_time)
+            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
+                               "is_success": is_success, "swf_images": str(swf_images)})
+
+        # 结果保存result.html
+        text_str = ""
+        for t in text:
+            text_str += t
+        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
+
+        # 取纯文本
+        only_text = []
+        for t in text:
+            new_t = BeautifulSoup(t, "lxml").get_text()
+            new_t = re.sub("\n", "", new_t)
+            only_text.append(new_t)
+
+        # 判断附件类型
+        classification = from_atc_interface(' '.join(only_text))
+        if judge_error_code(classification):
+            classification = [str(classification[0])]
+
+        # 判断长度,过长截取
+        text = cut_str(text, only_text, max_bytes)
+        only_text = cut_str(only_text, only_text)
+
+        if len(only_text) == 0:
+            only_text = [""]
+
+        if only_text[0] == '' and len(only_text) <= 1:
+            log_convert_result(_md5, '', '', 1,
+                               _type, None, start_time)
+        else:
+            log_convert_result(_md5, only_text, text, 1,
+                               _type, classification, start_time)
+        return json.dumps({"result_html": text, "result_text": only_text,
+                           "is_success": 1, "swf_images": str(swf_images),
+                           "classification": classification})
+
+    except ConnectionError:
+        log_convert_result(_md5, [-2], "", 0,
+                           _type, None, start_time)
+        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
+                           "is_success": 0, "swf_images": str([]),
+                           "classification": ""})
+    except Exception:
+        log_convert_result(_md5, [-1], "", 0,
+                           _type, None, start_time)
+        traceback.print_exc()
+        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
+                           "is_success": 0, "swf_images": str([]),
+                           "classification": ""})
+    finally:
+        pass
+        # log("finally")
+
+
+def log_convert_result(_md5, only_text, text, is_success, _type, _attach_class, start_time):
+    str_list = [
+        "md5: " + str(_md5),
+        "finished result: " + re.sub(' ', '', str(only_text)[:20]),
+        str(len(str(text))),
+        "is_success: " + str(is_success),
+        str(_type),
+        str(_attach_class),
+        str(round(time.time()-start_time, 3)),
+    ]
+    info = ' '.join(str_list)
+    log(info)
+
+
+def convert_old_250613(data):
     """
     """
     接口返回值:
     接口返回值:
     {[str], 1}: 处理成功
     {[str], 1}: 处理成功
@@ -558,7 +771,7 @@ def convert(data):
     :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
     :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
     """
     """
     _global._init()
     _global._init()
-    _global.update({"md5": "1"+"0"*15})
+    _global.update({"md5": "1" + "0" * 15})
     set_flask_global()
     set_flask_global()
 
 
     log("into convert")
     log("into convert")
@@ -584,7 +797,8 @@ def convert(data):
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             try:
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'))
             except TimeoutError:
             except TimeoutError:
                 log("convert time out! 300 sec")
                 log("convert time out! 300 sec")
                 text = [-5]
                 text = [-5]
@@ -592,7 +806,8 @@ def convert(data):
         else:
         else:
             # Linux 通过装饰器设置整个转换超时时间
             # Linux 通过装饰器设置整个转换超时时间
             try:
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'))
             except TimeoutError:
             except TimeoutError:
                 log("convert time out! 300 sec")
                 log("convert time out! 300 sec")
                 text = [-5]
                 text = [-5]
@@ -604,11 +819,12 @@ def convert(data):
                 is_success = 1
                 is_success = 1
             else:
             else:
                 is_success = 0
                 is_success = 0
-            log("md5: " + str(_md5)
-                + " finished result: " + str(text)
-                + " is_success: " + str(is_success) + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(text) + " "
+                + "is_success: " + str(is_success) + " "
                 + str(_type) + " "
                 + str(_type) + " "
-                + " " + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                     "is_success": is_success, "swf_images": str(swf_images)}
                     "is_success": is_success, "swf_images": str(swf_images)}
 
 
@@ -639,18 +855,19 @@ def convert(data):
             only_text = [""]
             only_text = [""]
 
 
         if only_text[0] == '' and len(only_text) <= 1:
         if only_text[0] == '' and len(only_text) <= 1:
-            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
+            # print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             log("md5: " + str(_md5) + " "
             log("md5: " + str(_md5) + " "
-                + " finished result: ['', 0] is_success: 1 "
+                + "finished result: ['', 0] is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
         else:
         else:
-            log("md5: " + str(_md5) +
-                " finished result: " + str(only_text)[:20] + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(only_text)[:20] + " "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
                 + str(classification) + " "
                 + str(classification) + " "
-                + str(time.time() - start_time))
+                + str(round(time.time() - start_time, 2)))
 
 
         return {"result_html": text, "result_text": only_text,
         return {"result_html": text, "result_text": only_text,
                 "is_success": 1, "swf_images": str(swf_images),
                 "is_success": 1, "swf_images": str(swf_images),
@@ -658,7 +875,7 @@ def convert(data):
 
 
     except ConnectionError:
     except ConnectionError:
         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
-            + str(time.time() - start_time))
+            + str(round(time.time() - start_time, 2)))
         return {"result_html": ["-2"], "result_text": ["-2"],
         return {"result_html": ["-2"], "result_text": ["-2"],
                 "is_success": 0, "swf_images": str([]),
                 "is_success": 0, "swf_images": str([]),
                 "classification": ""}
                 "classification": ""}
@@ -689,7 +906,7 @@ def convert_old(data, ocr_model, otr_model):
     """
     """
     log("into convert")
     log("into convert")
     _global._init()
     _global._init()
-    _global.update({"md5": "1"+"0"*15})
+    _global.update({"md5": "1" + "0" * 15})
     # set_flask_global()
     # set_flask_global()
 
 
     start_time = time.time()
     start_time = time.time()
@@ -706,7 +923,7 @@ def convert_old(data, ocr_model, otr_model):
         _md5 = get_md5_from_bytes(stream)
         _md5 = get_md5_from_bytes(stream)
         _md5 = _md5[0]
         _md5 = _md5[0]
         _global.update({"md5": _md5})
         _global.update({"md5": _md5})
-        log("get bytes from file " + str(time.time()-_time))
+        log("get bytes from file " + str(time.time() - _time))
 
 
         if get_platform() == "Windows":
         if get_platform() == "Windows":
             try:
             try:
@@ -730,11 +947,12 @@ def convert_old(data, ocr_model, otr_model):
                 is_success = 1
                 is_success = 1
             else:
             else:
                 is_success = 0
                 is_success = 0
-            log("md5: " + str(_md5)
-                + " finished result: " + str(text)
-                + " is_success: " + str(is_success) + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(text) + " "
+                + "is_success: " + str(is_success) + " "
                 + str(_type) + " "
                 + str(_type) + " "
-                + " " + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                     "is_success": is_success, "swf_images": str(swf_images)}
                     "is_success": is_success, "swf_images": str(swf_images)}
 
 
@@ -761,22 +979,24 @@ def convert_old(data, ocr_model, otr_model):
         if only_text[0] == '' and len(only_text) <= 1:
         if only_text[0] == '' and len(only_text) <= 1:
             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             log("md5: " + str(_md5) + " "
             log("md5: " + str(_md5) + " "
-                + " finished result: ['', 0] is_success: 1 "
+                + "finished result: ['', 0] is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
         else:
         else:
-            log("md5: " + str(_md5) +
-                " finished result: " + str(only_text)[:20] + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(only_text)[:20] + " "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
 
 
         return {"result_html": text, "result_text": only_text,
         return {"result_html": text, "result_text": only_text,
                 "is_success": 1, "swf_images": str(swf_images)}
                 "is_success": 1, "swf_images": str(swf_images)}
 
 
     except ConnectionError:
     except ConnectionError:
         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
-            + str(time.time() - start_time))
+            + str(round(time.time() - start_time, 2)))
         return {"result_html": ["-2"], "result_text": ["-2"],
         return {"result_html": ["-2"], "result_text": ["-2"],
                 "is_success": 0, "swf_images": str([])}
                 "is_success": 0, "swf_images": str([])}
     except Exception as e:
     except Exception as e:
@@ -801,9 +1021,9 @@ def test_more(_dir, process_no=None):
     for p in file_path_list:
     for p in file_path_list:
         if i % 10 == 0:
         if i % 10 == 0:
             if process_no is not None:
             if process_no is not None:
-                print("Process", process_no, i, time.time()-start_time)
+                print("Process", process_no, i, time.time() - start_time)
             else:
             else:
-                print("Loop", i, time.time()-start_time)
+                print("Loop", i, time.time() - start_time)
         test_one(p, from_remote=True)
         test_one(p, from_remote=True)
         i += 1
         i += 1
 
 
@@ -847,79 +1067,28 @@ def test_duplicate(path_list, process_no=None):
     for i in range(500):
     for i in range(500):
         if i % 10 == 0:
         if i % 10 == 0:
             if process_no is not None:
             if process_no is not None:
-                print("Process", process_no, i*len(path_list), time.time()-start_time)
+                print("Process", process_no, i * len(path_list), time.time() - start_time)
             else:
             else:
-                print("Loop", i*len(path_list), time.time()-start_time)
+                print("Loop", i * len(path_list), time.time() - start_time)
         for p in path_list:
         for p in path_list:
             test_one(p, from_remote=True)
             test_one(p, from_remote=True)
 
 
 
 
-global_type = ""
-local_url = "http://127.0.0.1"
-if get_platform() == "Windows":
-    _path = os.path.abspath(os.path.dirname(__file__))
-else:
-    _path = "/home/admin"
-    if not os.path.exists(_path):
-        _path = os.path.dirname(os.path.abspath(__file__))
+# global_type = ""
+# local_url = "http://127.0.0.1"
+# if get_platform() == "Windows":
+#     _path = os.path.abspath(os.path.dirname(__file__))
+# else:
+#     _path = "/home/admin"
+#     if not os.path.exists(_path):
+#         _path = os.path.dirname(os.path.abspath(__file__))
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    # convert interface
-    if len(sys.argv) == 2:
-        port = int(sys.argv[1])
-    else:
-        port = 15010
-
-    globals().update({"md5": "1"+"0"*15})
+    port = 15010
+    globals().update({"md5": "1" + "0" * 15})
     globals().update({"port": str(port)})
     globals().update({"port": str(port)})
-    # _global._init()
-    # _global.update({"md5": "1"+"0"*15})
-    # _global.update({"port": str(port)})
-
-    # ip = get_intranet_ip()
-    # log("my ip"+str(ip))
-    # ip = "http://" + ip
     ip_port_dict = get_ip_port()
     ip_port_dict = get_ip_port()
-
     set_flask_global()
     set_flask_global()
+    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
 
 
-    if get_platform() == "Windows":
-        app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
-    else:
-        # app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
-        app.run(port=15011)
-
-    # if get_platform() == "Windows":
-    #     file_path = "C:/Users/Administrator/Desktop/test_image/error29.png"
-    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
-    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
-    #     # file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
-    # else:
-    #     file_path = "test1.doc"
-    # test_one(file_path, from_remote=True)
-
-    # if get_platform() == "Windows":
-    #     file_dir = "D:/BIDI_DOC/比地_文档/table_images/"
-    # else:
-    #     file_dir = "../table_images/"
-    #
-    # for j in range(10):
-    #     p = Process(target=test_more, args=(file_dir, j, ))
-    #     p.start()
-    # p.join()
-
-    # if get_platform() == "Windows":
-    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
-    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
-    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/11111111.rar"]
-    #     file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
-    #                       "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"]
-    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc"]
-    #
-    # else:
-    #     file_path_list = ["test1.pdf"]
-    # for j in range(10):
-    #     p = Process(target=test_duplicate, args=(file_path_list, j, ))
-    #     p.start()
-    # p.join()

+ 94 - 22
format_convert/convert_doc.py

@@ -6,7 +6,7 @@ import sys
 import chardet
 import chardet
 from bs4 import BeautifulSoup
 from bs4 import BeautifulSoup
 sys.path.append(os.path.dirname(__file__) + "/../")
 sys.path.append(os.path.dirname(__file__) + "/../")
-from format_convert.convert_tree import _Document, _Sentence, _Page
+from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
 import logging
 import logging
 import traceback
 import traceback
 from format_convert import get_memory_info
 from format_convert import get_memory_info
@@ -35,11 +35,71 @@ def doc2text(path, unique_type_dir):
 class DocConvert:
 class DocConvert:
     def __init__(self, path, unique_type_dir):
     def __init__(self, path, unique_type_dir):
         self._doc = _Document(path)
         self._doc = _Document(path)
+        self._page = _Page(None, 0)
         self.path = path
         self.path = path
         self.unique_type_dir = unique_type_dir
         self.unique_type_dir = unique_type_dir
         self.tika_html = None
         self.tika_html = None
+        print('into DocConvert __init__')
 
 
     def convert(self):
     def convert(self):
+        print('into DocConvert convert')
+        # 先判断特殊doc文件,可能是html文本
+        # is_html_doc = False
+        # try:
+        #     try:
+        #         with open(self.path, 'r') as f:
+        #             html_str = f.read()
+        #     except UnicodeDecodeError:
+        #         with open(self.path, 'r', errors='ignore') as f:
+        #             html_str = f.read()
+        #     # if re.search('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str):
+        #     if len(re.findall('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str)) >= 10:
+        #         log('doc as html!')
+        #         soup = BeautifulSoup(html_str, 'lxml')
+        #         text = soup.text
+        #         is_html_doc = True
+        # except:
+        #     pass
+        #
+        # if is_html_doc:
+        #     self._page = _Page(None, 0)
+        #     _sen = _Sentence(text, (0, 0, 0, 0))
+        #     self._page.add_child(_sen)
+        #     self._doc.add_child(self._page)
+
+        # 先判断特殊doc文件,可能是html文本
+        is_html_doc = self.maybe_html()
+
+        if not is_html_doc:
+            # 调用office格式转换
+            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
+            if judge_error_code(file_path):
+                # office转换失败,调用tika,提取各个类型对象
+                try:
+                    self.use_tika(self.path)
+                except:
+                    traceback.print_exc()
+                    self._doc.error_code = [-17]
+                    log('doc tika failed too')
+                return
+
+            _docx = DocxConvert(file_path, self.unique_type_dir)
+            _docx.convert()
+            self._doc = _docx._doc
+            # if self._doc.error_code is not None:
+            #     # docx提取失败,调用tika,提取各个类型对象
+            #     print('DocxConvert failed use_tika')
+            #     self.use_tika(self.path)
+            #     self._doc.error_code = None
+            #     # # 调用tika提取
+            #     # html = from_tika_interface(self.path)
+            #     # if judge_error_code(html):
+            #     #     self._doc.error_code = html
+            #     # self.tika_html = html
+            #     # self._doc.error_code = None
+            #     return
+
+    def maybe_html(self):
         # 先判断特殊doc文件,可能是html文本
         # 先判断特殊doc文件,可能是html文本
         is_html_doc = False
         is_html_doc = False
         try:
         try:
@@ -63,27 +123,39 @@ class DocConvert:
             _sen = _Sentence(text, (0, 0, 0, 0))
             _sen = _Sentence(text, (0, 0, 0, 0))
             self._page.add_child(_sen)
             self._page.add_child(_sen)
             self._doc.add_child(self._page)
             self._doc.add_child(self._page)
-        else:
-            # 调用office格式转换
-            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
-            if judge_error_code(file_path):
-                # 调用tika提取
-                html = from_tika_interface(self.path)
-                if judge_error_code(html):
-                    self._doc.error_code = html
-                self.tika_html = html
-                return
-            _docx = DocxConvert(file_path, self.unique_type_dir)
-            _docx.convert()
-            self._doc = _docx._doc
-            if self._doc.error_code is not None:
-                # 调用tika提取
-                html = from_tika_interface(self.path)
-                if judge_error_code(html):
-                    self._doc.error_code = html
-                self.tika_html = html
-                self._doc.error_code = None
-                return
+
+        return is_html_doc
+
+    def use_tika(self, _path):
+        # 调用tika提取
+        # html = from_tika_interface(self.path)
+        # if judge_error_code(html):
+        #     self._doc.error_code = html
+        # self.tika_html = html
+        data = from_tika_interface(_path)
+        if judge_error_code(data):
+            self._doc.error_code = data
+            return
+        current_y = 5
+        for di, d in enumerate(data):
+            data_type, value = d
+            bbox = [0, current_y, 20, current_y+10]
+            current_y += 20
+            if data_type == 'text':
+                _sen = _Sentence(value, bbox)
+                _sen.combine = False
+                self._page.add_child(_sen)
+            elif data_type == 'img':
+                with open(value, "rb") as f:
+                    img = f.read()
+                _img = _Image(img, value, bbox)
+                _img.is_from_docx = True
+                self._page.add_child(_img)
+            elif data_type == 'table':
+                _table = _Table(value, bbox)
+                _table.is_html = True
+                self._page.add_child(_table)
+        self._doc.add_child(self._page)
 
 
     def get_html(self):
     def get_html(self):
         try:
         try:

+ 205 - 18
format_convert/convert_docx.py

@@ -10,7 +10,8 @@ import xml
 import zipfile
 import zipfile
 import docx
 import docx
 from bs4 import BeautifulSoup
 from bs4 import BeautifulSoup
-from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
+from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code, \
+    get_table_html
 from format_convert.wrapt_timeout_decorator import timeout
 from format_convert.wrapt_timeout_decorator import timeout
 from format_convert.convert_image import ImageConvert
 from format_convert.convert_image import ImageConvert
 from format_convert.convert_need_interface import from_tika_interface
 from format_convert.convert_need_interface import from_tika_interface
@@ -313,7 +314,7 @@ def read_xml_order(unique_type_dir, document_xml, numbering_xml, document_xml_re
 
 
 @timeout(50, timeout_exception=TimeoutError)
 @timeout(50, timeout_exception=TimeoutError)
 def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
 def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
-    def recursion_read_table(table):
+    def recursion_read_table(table, show=0):
         table_text = '<table border="1">'
         table_text = '<table border="1">'
         tr_index = 0
         tr_index = 0
         tr_text_list = []
         tr_text_list = []
@@ -349,6 +350,7 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
                             if is_merge == "continue":
                             if is_merge == "continue":
                                 row_span_dict[tc_index][0] += 1
                                 row_span_dict[tc_index][0] += 1
                                 tc_index += col_span
                                 tc_index += col_span
+                                tc_text_list.append([tc_text, col_span])
                                 # 跳过,不增加td
                                 # 跳过,不增加td
                                 continue
                                 continue
                                 # col_span_index = 0
                                 # col_span_index = 0
@@ -403,6 +405,11 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
                 tr_index += 1
                 tr_index += 1
                 tr_text_list.append(tc_text_list)
                 tr_text_list.append(tc_text_list)
 
 
+        if show:
+            for row in tr_text_list:
+                print('row', row)
+                print('len(row)', len(row))
+
         # 替换所有row_span
         # 替换所有row_span
         for key in row_span_dict.keys():
         for key in row_span_dict.keys():
             row_span, finish_row_span_flag = row_span_dict.get(key)
             row_span, finish_row_span_flag = row_span_dict.get(key)
@@ -420,7 +427,8 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
         for node in body_nodes:
         for node in body_nodes:
             if 'w:tbl' in str(node).split(' '):
             if 'w:tbl' in str(node).split(' '):
                 _table = node
                 _table = node
-                _table_text = recursion_read_table(_table)
+                # _table_text = recursion_read_table(_table)
+                _table_text = xml_table_to_html(_table, unique_type_dir, numbering_xml, document_xml_rels)
                 table_text_list.append(_table_text)
                 table_text_list.append(_table_text)
         return table_text_list
         return table_text_list
 
 
@@ -430,6 +438,146 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
         return [-1]
         return [-1]
 
 
 
 
+def xml_table_to_html(table, unique_type_dir, numbering_xml, document_xml_rels, show=0):
+    tr_index = 0
+    tr_text_list = []
+    last_node_level = 0
+    num_pr_dict = {}
+
+    # 直接子节点用child表示,所有子节点用all表示
+    for table_child in table.childNodes:
+        if 'w:tr' in str(table_child):
+            tr = table_child
+            tr_child_nodes = tr.childNodes
+            tc_index = 0
+            tc_text_list = []
+            for tr_child in tr_child_nodes:
+                if 'w:tc' in str(tr_child).split(' '):
+                    tc_text = ""
+                    tc = tr_child
+                    # 获取一格占多少列,相当于colspan
+                    col_span = tc.getElementsByTagName("w:gridSpan")
+                    if col_span:
+                        col_span = int(col_span[0].getAttribute("w:val"))
+                    else:
+                        col_span = 1
+                    # 获取是否是合并单元格的下一个空单元格,相当于rowspan
+                    is_merge = tc.getElementsByTagName("w:vMerge")
+                    if is_merge:
+                        is_merge = is_merge[0].getAttribute("w:val")
+                        if is_merge == "continue":
+                            tc_index += col_span
+                            tc_text = '@continue@'
+                            tc_text_list.append([tc_text, col_span])
+                            # 跳过,不增加td
+                            continue
+
+                    # 放入文本
+                    tc_child_nodes = tc.childNodes
+                    for tc_child in tc_child_nodes:
+                        # 处理嵌套在tc中的表格
+                        if 'w:tbl' in str(tc_child).split(' '):
+                            tc_text += xml_table_to_html(tc_child, unique_type_dir, numbering_xml, document_xml_rels)
+                        # 处理编号
+                        if 'w:p' in str(tc_child).split(' '):
+                            _t_list, _, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
+                                                                                   tc_child,
+                                                                                   last_node_level,
+                                                                                   num_pr_dict,
+                                                                                   numbering_xml,
+                                                                                   document_xml_rels)
+                            tc_text += ''.join(_t_list)
+                    # 结束该tc
+                    tc_index += col_span
+                    tc_text_list.append([tc_text, col_span])
+            # 结束该tr
+            tr_index += 1
+            tr_text_list.append(tc_text_list)
+
+    if show:
+        for row in tr_text_list:
+            print('row', row)
+            print('len(row)', len(row))
+
+    table_html = row_list_to_table(tr_text_list)
+    return table_html
+
+
+def row_list_to_table(row_list, show=0):
+    if show:
+        print('='*50)
+
+    # 复制合并列
+    new_row_list = []
+    for row in row_list:
+        new_row = []
+        for col, col_span in row:
+            new_row += [[col, col_span]]
+            if col_span > 1:
+                new_row += [[col, 0]] * (col_span - 1)
+        new_row_list.append(new_row)
+    row_list = new_row_list
+
+    if show:
+        for row in row_list:
+            print('copy row', row)
+
+    # 计算是不是每行都有相等列数
+    row_cnt_list = []
+    for row in row_list:
+        row_cnt_list.append(len(row))
+
+    if len(set(row_cnt_list)) != 1:
+        log('表格有列数不同,直接返回text' + str(row_cnt_list))
+        # 直接返回所有col的text
+        text = ''
+        for row in row_list:
+            for col, col_span in row:
+                text += col
+        return text
+
+    new_row_list = []
+    for ri, row in enumerate(row_list):
+        new_row = []
+        for ci, col in enumerate(row):
+            col, col_span = col
+            row_span = 1
+            # 判断下面行同列有没有需合并的
+            for ri2 in range(ri+1, len(row_list)):
+                col2, col_span2 = row_list[ri2][ci]
+                if col2 == '@continue@':
+                    row_span += 1
+                else:
+                    break
+
+            # 需跳过的列
+            if col == '@continue@' or col_span == 0:
+                delete = 1
+            else:
+                delete = 0
+
+            col_dict = {
+                'text': col,
+                'rowspan': row_span,
+                'columnspan': col_span,
+                'delete': delete,
+            }
+            new_row.append(col_dict)
+        new_row_list.append(new_row)
+
+    if show:
+        for new_row in new_row_list:
+            print('new_row', new_row)
+
+    table_html = get_table_html(new_row_list)
+
+    # soup = BeautifulSoup(table_html, 'lxml')
+    # print(soup.prettify())
+    if show:
+        print('-' * 50)
+    return table_html
+
+
 @timeout(25, timeout_exception=TimeoutError)
 @timeout(25, timeout_exception=TimeoutError)
 def parse_xml(path):
 def parse_xml(path):
     # 解析xml
     # 解析xml
@@ -449,6 +597,7 @@ def parse_xml2(path):
 class DocxConvert:
 class DocxConvert:
     def __init__(self, path, unique_type_dir):
     def __init__(self, path, unique_type_dir):
         self._doc = _Document(path)
         self._doc = _Document(path)
+        self._page = _Page(None, 0)
         self.path = path
         self.path = path
         self.unique_type_dir = unique_type_dir
         self.unique_type_dir = unique_type_dir
 
 
@@ -497,8 +646,6 @@ class DocxConvert:
             self._doc.error_code = [-3]
             self._doc.error_code = [-3]
 
 
     def convert(self):
     def convert(self):
-        self._page = _Page(None, 0)
-
         # 先判断特殊doc文件,可能是html文本
         # 先判断特殊doc文件,可能是html文本
         is_html_doc = False
         is_html_doc = False
         try:
         try:
@@ -630,23 +777,62 @@ class DocxConvert:
     def get_doc_object(self):
     def get_doc_object(self):
         return self._doc
         return self._doc
 
 
+    def use_tika(self, _path):
+        # 调用tika提取
+        # html = from_tika_interface(self.path)
+        # if judge_error_code(html):
+        #     self._doc.error_code = html
+        # self.tika_html = html
+        data = from_tika_interface(_path)
+        if judge_error_code(data):
+            self._doc.error_code = data
+            return
+        current_y = 5
+        for di, d in enumerate(data):
+            data_type, value = d
+            bbox = [0, current_y, 20, current_y+10]
+            current_y += 20
+            if data_type == 'text':
+                _sen = _Sentence(value, bbox)
+                _sen.combine = False
+                self._page.add_child(_sen)
+            elif data_type == 'img':
+                with open(value, "rb") as f:
+                    img = f.read()
+                _img = _Image(img, value, bbox)
+                _img.is_from_docx = True
+                self._page.add_child(_img)
+            elif data_type == 'table':
+                _table = _Table(value, bbox)
+                _table.is_html = True
+                self._page.add_child(_table)
+        self._doc.add_child(self._page)
+
     def get_html(self):
     def get_html(self):
         if self._doc.error_code is not None:
         if self._doc.error_code is not None:
             return self._doc.error_code
             return self._doc.error_code
         try:
         try:
+            # raise
             self.convert()
             self.convert()
         except:
         except:
             traceback.print_exc()
             traceback.print_exc()
             self._doc.error_code = [-1]
             self._doc.error_code = [-1]
         # log('docx error code ' + str(self._doc.error_code))
         # log('docx error code ' + str(self._doc.error_code))
         if self._doc.error_code is not None:
         if self._doc.error_code is not None:
-            # 调用tika提取
-            html = from_tika_interface(self.path)
-            if judge_error_code(html):
-                self._doc.error_code = html
-                return self._doc.error_code
-            else:
-                return [html]
+            # # 调用tika提取
+            # html = from_tika_interface(self.path)
+            # if judge_error_code(html):
+            #     self._doc.error_code = html
+            #     return self._doc.error_code
+            # else:
+            #     return [html]
+            try:
+                self.use_tika(self.path)
+                self._doc.error_code = None
+            except:
+                traceback.print_exc()
+                log('docx tika failed too')
+                self._doc.error_code = [-17]
         return self._doc.get_html()
         return self._doc.get_html()
 
 
 
 
@@ -791,9 +977,10 @@ class DocxConvertNew:
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    c = DocxConvert("C:/Users/Administrator/Downloads/dsdsd.docx", "C:/Users/Administrator/Downloads/1/")
-    print(c.get_html())
-
-    # c = DocxConvertNew()
-    # # c.read_docx(r'C:\Users\Administrator\Desktop\test_doc\error14.docx')
-    # c.read_docx(r'C:/Users/Administrator/Downloads/dsdsd.docx')
+    _p = r'C:/Users/Administrator/Downloads/1723004790329.docx'
+    # _p = "C:/Users/Administrator/Desktop/test_doc/error14.docx"
+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
+    c = DocxConvert(_p, save_dir)
+    _html = c.get_html()
+    with open('../result.html', 'w', encoding='utf-8') as f:
+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + str(_html[0]))

+ 819 - 25
format_convert/convert_image.py

@@ -21,7 +21,7 @@ from format_convert.utils import judge_error_code, add_div, LineTable, get_table
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
     from_idc_interface, from_isr_interface
     from_idc_interface, from_isr_interface
 from format_convert.table_correct import get_rotated_image
 from format_convert.table_correct import get_rotated_image
-from botr.extract_table import get_table
+from botr.extract_table import get_table, get_b_table_by_blank_colon
 
 
 
 
 def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
@@ -66,7 +66,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
     def merge_textbox(textbox_list, in_objs):
     def merge_textbox(textbox_list, in_objs):
         delete_obj = []
         delete_obj = []
         threshold = 5
         threshold = 5
-        textbox_list.sort(key=lambda x:x.bbox[0])
+        textbox_list.sort(key=lambda x: x.bbox[0])
         for k in range(len(textbox_list)):
         for k in range(len(textbox_list)):
             tb1 = textbox_list[k]
             tb1 = textbox_list[k]
             if tb1 not in in_objs and tb1 not in delete_obj:
             if tb1 not in in_objs and tb1 not in delete_obj:
@@ -74,6 +74,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                     tb2 = textbox_list[m]
                     tb2 = textbox_list[m]
                     if tb2 in in_objs:
                     if tb2 in in_objs:
                         continue
                         continue
+                    # print('tb1 tb2', tb1, tb2)
                     if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
                     if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
                             and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
                             and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
                         if tb1.bbox[0] <= tb2.bbox[0]:
                         if tb1.bbox[0] <= tb2.bbox[0]:
@@ -88,9 +89,9 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                 textbox_list.remove(_obj)
                 textbox_list.remove(_obj)
         return textbox_list
         return textbox_list
 
 
-    def resize_process(_image_np):
+    def resize_process(_image_np, threshold=2048):
+    # def resize_process(_image_np, threshold=1280):
         # 整体分辨率限制
         # 整体分辨率限制
-        threshold = 2048
         if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
         if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
             h, w = get_best_predict_size2(_image_np, threshold=threshold)
             h, w = get_best_predict_size2(_image_np, threshold=threshold)
             log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
             log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
@@ -169,14 +170,24 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
         log("isr total time "+str(time.time()-_isr_time))
         log("isr total time "+str(time.time()-_isr_time))
         return _image_np
         return _image_np
 
 
-    def ocr_process(_image_np, _threshold=2048):
+    # def ocr_process(_image_np, _threshold=2048):
+    def ocr_process(_image_np, _threshold=1080):
         log("ocr_process image shape " + str(_image_np.shape))
         log("ocr_process image shape " + str(_image_np.shape))
 
 
+        # 过小直接返回
+        if _image_np.shape[0] <= 10 or _image_np.shape[1] <= 10:
+            return [], []
+        if _image_np.shape[0] < 50 and _image_np.shape[1] / _image_np.shape[0] > 20:
+            return [], []
+        if _image_np.shape[1] < 50 and _image_np.shape[0] / _image_np.shape[1] > 20:
+            return [], []
+
         # ocr图片过大内存溢出,需resize
         # ocr图片过大内存溢出,需resize
         # 大图按比例缩小,小图维持不变;若统一拉伸成固定大小如1024会爆显存
         # 大图按比例缩小,小图维持不变;若统一拉伸成固定大小如1024会爆显存
         ratio = (1, 1)
         ratio = (1, 1)
         if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
         if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
-            best_h, best_w = get_best_predict_size2(_image_np, _threshold)
+            # best_h, best_w = get_best_predict_size2(_image_np, _threshold)
+            best_h, best_w = get_best_predict_size_by_area(_image_np, _threshold)
             _image_np = pil_resize(_image_np, best_h, best_w)
             _image_np = pil_resize(_image_np, best_h, best_w)
             log("ocr_process image resize " + str(_image_np.shape))
             log("ocr_process image resize " + str(_image_np.shape))
             ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
             ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
@@ -189,7 +200,13 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
 
         # 调用ocr模型接口
         # 调用ocr模型接口
         image_bytes = np2bytes(_image_np)
         image_bytes = np2bytes(_image_np)
-        text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
+        result = from_ocr_interface(image_bytes, is_table=1)
+        # print('from_ocr_interface result ', result)
+        if len(result) != 2:
+            return result, result
+
+        text_list, bbox_list = result
+        # text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
         if judge_error_code(text_list):
         if judge_error_code(text_list):
             return text_list, text_list
             return text_list, text_list
 
 
@@ -264,6 +281,13 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
 
     def botr_process(_image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
     def botr_process(_image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
                      from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
                      from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
+
+        temp_list = []
+        for _table2 in table_list2:
+            _table2 = _Table(_table2["table"], _table2["bbox"])
+            temp_list.append(_table2)
+        table_list2 = temp_list
+
         if from_pdf:
         if from_pdf:
             # 交叉验证 ocr结果与pdf obj,暂时使用pdf提取的
             # 交叉验证 ocr结果与pdf obj,暂时使用pdf提取的
             h_ratio = _image_np.shape[0] / pdf_layout_size[1]
             h_ratio = _image_np.shape[0] / pdf_layout_size[1]
@@ -300,14 +324,55 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             box_list2 = pdf_box_list
             box_list2 = pdf_box_list
             text_box_list2 = pdf_text_box_list
             text_box_list2 = pdf_text_box_list
 
 
-        _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, table_list2, text_list2, box_list2, text_box_list2)
-
-        # 保存无边框表格文件
-        if _table_list:
+            _b_table_list = []
+            _not_b_table_list = []
+        else:
+            # 无边框新规则,补充添加 2505015
+            # 根据text规律,判断该页是否可能有无边框表格
             try:
             try:
-                save_b_table(_image_np, text_box_list2, from_pdf)
+                _b_table_list, _not_b_table_list = get_b_table_by_blank_colon(text_box_list2, table_list2, (
+                0, 0, _image_np.shape[1], _image_np.shape[0]), _image_np)
             except:
             except:
-                pass
+                traceback.print_exc()
+                return [-23], [], []
+
+            # print('_b_table_list111', _b_table_list)
+            if _b_table_list:
+                temp_list = []
+                for _b_table in _b_table_list:
+                    _b_table = _Table(_b_table[0], _b_table[1])
+                    # table_list2 += [_b_table]
+                    temp_list.append(_b_table)
+                _b_table_list = temp_list
+            if _not_b_table_list:
+                temp_list = []
+                for _b_table in _not_b_table_list:
+                    _b_table = _Table(_b_table[0], _b_table[1])
+                    temp_list.append(_b_table)
+                _not_b_table_list = temp_list
+
+        ignore_table_list = table_list2 + _b_table_list + _not_b_table_list
+        # yolo检测出的表格,忽略两列的,因为已经补充了两列的新规则 250529
+        _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, ignore_table_list, text_list2, box_list2, text_box_list2, from_pdf=from_pdf)
+        # print('_table_list', _table_list)
+        # print('_b_table_list222', _b_table_list)
+
+        # 无边框新规则,补充添加 2505015
+        _table_list = [_Table(x.get('table'), x.get('bbox')) for x in _table_list]
+        _table_list += _b_table_list
+        for _b_table in _b_table_list:
+            for _text_box in text_box_list2:
+                if _b_table.bbox[1] <= _text_box.bbox[1] <= _text_box.bbox[3] <= _b_table.bbox[3]:
+                    # print('add _obj_in_table_list 250515', _text_box)
+                    _obj_in_table_list.append(_text_box)
+        # print('_b_table_list233', _table_list)
+
+        # 保存无边框表格文件
+        # if _table_list:
+        #     try:
+        #         save_b_table(_image_np, text_box_list2, from_pdf)
+        #     except:
+        #         pass
 
 
         # print('_text_box_list', _text_box_list)
         # print('_text_box_list', _text_box_list)
         # print('_table_list', _table_list)
         # print('_table_list', _table_list)
@@ -496,7 +561,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             else:
             else:
                 # 根据index拆开图片,重新ocr
                 # 根据index拆开图片,重新ocr
                 split_index_list.insert(0, 0)
                 split_index_list.insert(0, 0)
-                print('split_index_list1', split_index_list)
+                # print('split_index_list1', split_index_list)
                 for _i, index in enumerate(split_index_list):
                 for _i, index in enumerate(split_index_list):
                     if _i == len(split_index_list) - 1:
                     if _i == len(split_index_list) - 1:
                         split_image_np = sub_image_np[:, index:, :]
                         split_image_np = sub_image_np[:, index:, :]
@@ -602,12 +667,12 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                 # 生成TextBox对象
                 # 生成TextBox对象
                 text_box_list = get_text_box_obj(text_list, box_list)
                 text_box_list = get_text_box_obj(text_list, box_list)
                 # for t in text_box_list:
                 # for t in text_box_list:
-                #     print('text_box0', t.get_text())
+                #     print('text_box0', t)
 
 
                 # 表格生成
                 # 表格生成
                 text_box_list, table_list, obj_in_table_list = table_process(line_list, text_box_list, image_np)
                 text_box_list, table_list, obj_in_table_list = table_process(line_list, text_box_list, image_np)
                 # for t in text_box_list:
                 # for t in text_box_list:
-                #     print('text_box1', t.get_text())
+                #     print('text_box1', t)
                 # print('table_list', table_list)
                 # print('table_list', table_list)
                 # for t in obj_in_table_list:
                 # for t in obj_in_table_list:
                 #     print('obj_text_box2', t.get_text())
                 #     print('obj_text_box2', t.get_text())
@@ -625,10 +690,20 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                                                                                 pdf_layout_size,
                                                                                 pdf_layout_size,
                                                                                 )
                                                                                 )
                 log('botr process cost: ' + str(time.time()-start_time))
                 log('botr process cost: ' + str(time.time()-start_time))
+                if judge_error_code(text_box_list):
+                    return text_box_list
+
+                # print('b_table_list333', b_table_list)
+                obj_in_table_list.update(set(b_obj_in_table_list))
+                # for t in text_box_list:
+                #     print('text_box2', t)
 
 
                 # 合并非表格的同一行TextBox
                 # 合并非表格的同一行TextBox
                 text_box_list = merge_textbox(text_box_list, obj_in_table_list)
                 text_box_list = merge_textbox(text_box_list, obj_in_table_list)
 
 
+                # for t in text_box_list:
+                #     print('text_box3', t)
+                # print('table_list, b_table_list', table_list, b_table_list)
                 table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
                 table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
 
 
             if reverse_flag:
             if reverse_flag:
@@ -649,16 +724,21 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             _add_y = 0
             _add_y = 0
             for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
             for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
                 obj_list = []
                 obj_list = []
+                # print('obj_in_table_list', obj_in_table_list)
                 for table in table_list:
                 for table in table_list:
-                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y,
+                                   table["bbox"][2], table["bbox"][3] + _add_y]
                     _table = _Table(table["table"], _table_bbox)
                     _table = _Table(table["table"], _table_bbox)
+                    # print('_table.bbo2x', _table.bbox)
                     obj_list.append(_table)
                     obj_list.append(_table)
                 for table in b_table_list:
                 for table in b_table_list:
-                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
-                    _table = _Table(table["table"], _table_bbox)
-                    obj_list.append(_table)
+                    # _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+                    # _table = _Table(table["table"], _table_bbox)
+                    # print('table.bbo1x', table.bbox)
+                    obj_list.append(table)
                 for text_box in text_box_list:
                 for text_box in text_box_list:
                     if text_box not in obj_in_table_list:
                     if text_box not in obj_in_table_list:
+                        # print('text_box',  text_box)
                         text_box.bbox[1] += _add_y
                         text_box.bbox[1] += _add_y
                         obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
                         obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
 
 
@@ -707,6 +787,8 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                                                                         pdf_layout_size,
                                                                         pdf_layout_size,
                                                                         )
                                                                         )
             log('botr process cost: ' + str(time.time()-start_time))
             log('botr process cost: ' + str(time.time()-start_time))
+            if judge_error_code(text_box_list):
+                return text_box_list
 
 
             # 合并非表格的同一行TextBox
             # 合并非表格的同一行TextBox
             text_box_list = merge_textbox(text_box_list, obj_in_table_list)
             text_box_list = merge_textbox(text_box_list, obj_in_table_list)
@@ -715,8 +797,10 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             obj_list = []
             obj_list = []
             # print('table_list', table_list)
             # print('table_list', table_list)
             for table in table_list:
             for table in table_list:
-                _table = _Table(table["table"], table["bbox"])
-                obj_list.append(_table)
+                # print('type(table)', type(table))
+                # _table = _Table(table["table"], table["bbox"])
+                # print('table.bbox', table.bbox)
+                obj_list.append(table)
             for text_box in text_box_list:
             for text_box in text_box_list:
                 if text_box not in obj_in_table_list:
                 if text_box not in obj_in_table_list:
                     obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
                     obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
@@ -732,6 +816,690 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
         return [-1]
         return [-1]
 
 
 
 
+# class ImageProcess:
+#     def __init__(self, image_np, image_path, is_from_pdf=False, is_from_docx=False,
+#                  b_table_from_text=False, pdf_obj_list=[], pdf_layout_size=(),
+#                  is_reverse=False):
+#
+#         self.image_np = image_np
+#         self.image_path = image_path
+#         self.is_from_pdf = is_from_pdf
+#         self.is_from_docx = is_from_docx
+#         self.b_table_from_text = b_table_from_text
+#         self.pdf_obj_list = pdf_obj_list
+#         self.pdf_layout_size = pdf_layout_size
+#         self.is_reverse = is_reverse
+#
+#     def merge_textbox(self, textbox_list, in_objs):
+#         delete_obj = []
+#         threshold = 5
+#         textbox_list.sort(key=lambda x:x.bbox[0])
+#         for k in range(len(textbox_list)):
+#             tb1 = textbox_list[k]
+#             if tb1 not in in_objs and tb1 not in delete_obj:
+#                 for m in range(k+1, len(textbox_list)):
+#                     tb2 = textbox_list[m]
+#                     if tb2 in in_objs:
+#                         continue
+#                     if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
+#                             and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
+#                         if tb1.bbox[0] <= tb2.bbox[0]:
+#                             tb1.text = tb1.text + tb2.text
+#                         else:
+#                             tb1.text = tb2.text + tb1.text
+#                         tb1.bbox[0] = min(tb1.bbox[0], tb2.bbox[0])
+#                         tb1.bbox[2] = max(tb1.bbox[2], tb2.bbox[2])
+#                         delete_obj.append(tb2)
+#         for _obj in delete_obj:
+#             if _obj in textbox_list:
+#                 textbox_list.remove(_obj)
+#         return textbox_list
+#
+#     def resize_process(self, _image_np):
+#         # 整体分辨率限制
+#         threshold = 2048
+#         if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
+#             h, w = get_best_predict_size2(_image_np, threshold=threshold)
+#             log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
+#             _image_np = pil_resize(_image_np, h, w)
+#         return _image_np
+#
+#     def idc_process(self, _image_np, return_angle=False):
+#         # 图片倾斜校正,写入原来的图片路径
+#         # print("image_process", image_path)
+#         # g_r_i = get_rotated_image(_image_np, image_path)
+#         # if judge_error_code(g_r_i):
+#         #     if is_from_docx:
+#         #         return []
+#         #     else:
+#         #         return g_r_i
+#         # _image_np = cv2.imread(image_path)
+#         # if _image_np is None:
+#         #     return []
+#         # return _image_np
+#
+#         # if _image_np is None:
+#         #     return []
+#
+#         # idc模型实现图片倾斜校正
+#         h, w = get_best_predict_size2(_image_np, 1080)
+#         image_resize = pil_resize(_image_np, h, w)
+#         # image_resize_path = image_path.split(".")[0] + "_resize_idc." + image_path.split(".")[-1]
+#         # cv2.imwrite(image_resize_path, image_resize)
+#
+#         # with open(image_resize_path, "rb") as f:
+#         #     image_bytes = f.read()
+#         image_bytes = np2bytes(image_resize)
+#         angle = from_idc_interface(image_bytes)
+#         log('idc_process angle ' + str(angle))
+#         if judge_error_code(angle):
+#             if return_angle:
+#                 if self.is_from_docx:
+#                     return [], []
+#                 else:
+#                     return angle, angle
+#             else:
+#                 if self.is_from_docx:
+#                     return []
+#                 else:
+#                     return angle
+#         # 根据角度旋转
+#         # _image_pil = Image.fromarray(_image_np)
+#         # _image_np = np.array(_image_pil.rotate(angle, expand=1))
+#         _image_np = image_rotate(_image_np, angle)
+#
+#         # 写入
+#         # idc_path = image_path.split(".")[0] + "_idc." + image_path.split(".")[-1]
+#         # cv2.imwrite(idc_path, image_np)
+#         if return_angle:
+#             return _image_np, angle
+#         return _image_np
+#
+#     def isr_process(self, _image_np):
+#         log("isr_process image shape " + str(_image_np.shape))
+#         image_np_copy = copy.deepcopy(_image_np)
+#         # isr模型去除印章
+#         _isr_time = time.time()
+#         if count_red_pixel(_image_np):
+#             # 红色像素达到一定值才过模型
+#             image_bytes = np2bytes(_image_np)
+#             _image_np = from_isr_interface(image_bytes)
+#             if judge_error_code(_image_np):
+#                 if self.is_from_docx:
+#                     return []
+#                 else:
+#                     return _image_np
+#             # [1]代表检测不到印章,直接返回
+#             if isinstance(_image_np, list) and _image_np == [1]:
+#                 log("no seals detected!")
+#                 _image_np = image_np_copy
+#         log("isr total time "+str(time.time()-_isr_time))
+#         return _image_np
+#
+#     def ocr_process(self, _image_np, _threshold=2048):
+#         log("ocr_process image shape " + str(_image_np.shape))
+#
+#         # ocr图片过大内存溢出,需resize
+#         # 大图按比例缩小,小图维持不变;若统一拉伸成固定大小如1024会爆显存
+#         ratio = (1, 1)
+#         h, w = _image_np.shape[:2]
+#         if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
+#             best_h, best_w = get_best_predict_size2(_image_np, _threshold)
+#             _image_np = pil_resize(_image_np, best_h, best_w)
+#             log("ocr_process image resize " + str(_image_np.shape))
+#             ratio = (h/best_h, w/best_w)
+#
+#         # 大图片ocr加锁,防止爆显存
+#         # if _image_np.shape[0] >= 1024 and _image_np.shape[1] >= 1024:
+#         #     file_lock = True
+#         # else:
+#         #     file_lock = False
+#
+#         # 调用ocr模型接口
+#         image_bytes = np2bytes(_image_np)
+#         text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
+#         if judge_error_code(text_list):
+#             return text_list, text_list
+#
+#         for i in range(len(bbox_list)):
+#             point = bbox_list[i]
+#             bbox_list[i] = [[int(point[0][0]*ratio[0]), int(point[0][1]*ratio[1])],
+#                             [int(point[1][0]*ratio[0]), int(point[1][1]*ratio[1])],
+#                             [int(point[2][0]*ratio[0]), int(point[2][1]*ratio[1])],
+#                             [int(point[3][0]*ratio[0]), int(point[3][1]*ratio[1])]]
+#
+#         # 去除水印字 根据识别是否为矩形框
+#         temp_text_list = []
+#         temp_bbox_list = []
+#         water_mark_dict = {}
+#         for i in range(len(bbox_list)):
+#             bbox = bbox_list[i]
+#             text = text_list[i]
+#             if len(re.findall('[\u4e00-\u9fa5]', text)) == len(text):
+#                 if (abs(bbox[0][1] - bbox[1][1]) <= 2 and abs(bbox[2][1] - bbox[3][1]) <= 2) \
+#                         or (abs(bbox[0][0] - bbox[3][0]) <= 4 and abs(bbox[2][0] - bbox[1][0]) <= 4):
+#                     temp_text_list.append(text)
+#                     temp_bbox_list.append(bbox)
+#                 else:
+#                     if text in water_mark_dict.keys():
+#                         water_mark_dict[text] += [bbox]
+#                     else:
+#                         water_mark_dict[text] = [bbox]
+#             else:
+#                 temp_text_list.append(text)
+#                 temp_bbox_list.append(bbox)
+#
+#         # 数量多的才算水印
+#         for text in water_mark_dict.keys():
+#             bbox_list = water_mark_dict.get(text)
+#             if len(bbox_list) < 3:
+#                 for bbox in bbox_list:
+#                     temp_text_list.append(text)
+#                     temp_bbox_list.append(bbox)
+#
+#         text_list = temp_text_list
+#         bbox_list = temp_bbox_list
+#         return text_list, bbox_list
+#
+#     def otr_process(self, _image_np):
+#         log("otr_process image shape " + str(_image_np.shape))
+#         # otr模型识别表格,需要图片resize成模型所需大小, 写入另一个路径
+#         best_h, best_w = get_best_predict_size(_image_np)
+#         image_resize = pil_resize(_image_np, best_h, best_w)
+#         # image_resize_path = image_path.split(".")[0] + "_resize_otr." + image_path.split(".")[-1]
+#         # cv2.imwrite(image_resize_path, image_resize)
+#
+#         # 调用otr模型接口
+#         # with open(image_resize_path, "rb") as f:
+#         #     image_bytes = f.read()
+#         image_bytes = np2bytes(image_resize)
+#         list_line = from_otr_interface(image_bytes, self.is_from_pdf)
+#         if judge_error_code(list_line):
+#             if self.is_from_docx:
+#                 return []
+#             else:
+#                 return list_line
+#
+#         # otr resize后得到的bbox根据比例还原
+#         start_time = time.time()
+#         ratio = (_image_np.shape[0]/best_h, _image_np.shape[1]/best_w)
+#         for i in range(len(list_line)):
+#             point = list_line[i]
+#             list_line[i] = [int(point[0]*ratio[1]), int(point[1]*ratio[0]),
+#                             int(point[2]*ratio[1]), int(point[3]*ratio[0])]
+#         log("otr resize bbox recover " + str(time.time()-start_time))
+#         return list_line
+#
+#     def botr_process(self, _image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
+#                      from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
+#         if from_pdf:
+#             # 交叉验证 ocr结果与pdf obj,暂时使用pdf提取的
+#             h_ratio = _image_np.shape[0] / pdf_layout_size[1]
+#             w_ratio = _image_np.shape[1] / pdf_layout_size[0]
+#             pdf_text_list = []
+#             pdf_box_list = []
+#             for obj in pdf_obj_list:
+#                 if obj.get_text() in ["", " "]:
+#                     continue
+#
+#                 # pdf坐标是上下颠倒的
+#                 # obj.bbox = (obj.bbox[0], pdf_layout_size[1]-obj.bbox[3],
+#                 #             obj.bbox[2], pdf_layout_size[1]-obj.bbox[1])
+#
+#                 # 根据两个页面大小比例调整坐标
+#                 obj.bbox = (obj.bbox[0]*w_ratio, obj.bbox[1]*h_ratio,
+#                             obj.bbox[2]*w_ratio, obj.bbox[3]*h_ratio)
+#
+#                 # 剔除水印字
+#                 text = re.sub('[\n ]', '', obj.get_text())
+#                 if len(text) == 1 and abs(obj.bbox[0] - obj.bbox[2]) >= 70:
+#                     continue
+#
+#                 pdf_box_list.append([[int(obj.bbox[0]), int(obj.bbox[1])],
+#                                      [],
+#                                      [int(obj.bbox[2]), int(obj.bbox[3])],
+#                                      []
+#                                      ])
+#                 pdf_text_list.append(re.sub('[\n]', '', obj.get_text()))
+#
+#             pdf_text_box_list = self.get_text_box_obj(pdf_text_list, pdf_box_list)
+#
+#             text_list2 = pdf_text_list
+#             box_list2 = pdf_box_list
+#             text_box_list2 = pdf_text_box_list
+#
+#         _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, table_list2, text_list2, box_list2, text_box_list2, from_pdf=from_pdf)
+#
+#         # 保存无边框表格文件
+#         if _table_list:
+#             try:
+#                 self.save_b_table(_image_np, text_box_list2, from_pdf)
+#             except:
+#                 pass
+#
+#         # print('_text_box_list', _text_box_list)
+#         # print('_table_list', _table_list)
+#         if from_pdf:
+#             text_box_list2 = []
+#             table_list2 = []
+#
+#         if _table_list and _text_box_list:
+#             text_box_list2 += _text_box_list
+#             text_box_list2 = list(set(text_box_list2))
+#             # table_list2 += _table_list
+#             # obj_in_table_list2 = obj_in_table_list2.union(_obj_in_table_list)
+#         return text_box_list2, _table_list, _obj_in_table_list
+#
+#     def table_process(self, list_line, list_text_boxes, _image_np):
+#         # 调用现成方法形成表格
+#         try:
+#             if list_line:
+#
+#                 # 排除掉短且经过文字bbox中间的竖线
+#                 temp_list = []
+#                 for line in list_line:
+#                     find_cnt = 0
+#                     if abs(line[0]-line[2]) < abs(line[1]-line[3]) and abs(line[1] - line[3]) <= _image_np.shape[0] / 20:
+#                         for t_obj in list_text_boxes:
+#                             # if not (t_obj.bbox[1] <= line[1] <= t_obj.bbox[3] or t_obj.bbox[1] <= line[3] <= t_obj.bbox[3]):
+#                             #     continue
+#                             if line_iou([[t_obj.bbox[1], 0], [t_obj.bbox[3], 0]], [[line[1], 0], [line[3], 0]]) < 0.3:
+#                                 continue
+#                             if abs(t_obj.bbox[0]-t_obj.bbox[2])/5 + min(t_obj.bbox[0], t_obj.bbox[2]) <= line[0] <= abs(t_obj.bbox[0]-t_obj.bbox[2])/5*4 + min(t_obj.bbox[0], t_obj.bbox[2]) and (t_obj.bbox[0]-t_obj.bbox[2]) <= 60:
+#                                 # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2], t_obj.get_text())
+#                                 find_cnt += 1
+#                                 if find_cnt >= 2:
+#                                     break
+#                     if find_cnt >= 2:
+#                         continue
+#                     temp_list.append(line)
+#                 list_line = temp_list
+#
+#                 from format_convert.convert_tree import TableLine
+#                 list_lines = []
+#                 for line in list_line:
+#                     list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
+#
+#                 lt = LineTable()
+#                 tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
+#                                                                                    sourceP_LB=False, splited=False,
+#                                                                                    from_pdf=self.is_from_pdf,
+#                                                                                    is_reverse=self.is_reverse)
+#                 # 需分割textbox
+#                 if connect_textbox_list:
+#                     list_text_boxes = self.table_textbox_split(_image_np, connect_textbox_list, list_text_boxes)
+#                     # 新的textbox,重新做表格
+#                     tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
+#                                                                                        sourceP_LB=False, splited=True,
+#                                                                                        from_pdf=self.is_from_pdf,
+#                                                                                        is_reverse=self.is_reverse)
+#
+#                 if not tables:
+#                     return list_text_boxes, tables, obj_in_table
+#                 return list_text_boxes, tables, obj_in_table
+#             else:
+#                 return list_text_boxes, [], set()
+#         except:
+#             traceback.print_exc()
+#             return [-8], [-8], [-8]
+#
+#     def slice_process(self, _image_np):
+#         slice_flag = need_image_slice(_image_np)
+#         log("need_image_slice " + str(slice_flag) + " " + str(_image_np.shape))
+#         _image_np_list = [_image_np]
+#         if slice_flag:
+#             # 长图分割
+#             _image_np_list = image_slice_new(_image_np)
+#             angle_dict = {}
+#             for im in _image_np_list:
+#                 _, angle = self.idc_process(im, return_angle=True)
+#                 if angle in [0, 360]:
+#                     angle = 0
+#                 if angle in angle_dict.keys():
+#                     angle_dict[angle] += 1
+#                 else:
+#                     angle_dict[angle] = 1
+#
+#             # idc不太准,有0度就直接使用
+#             if 0 in angle_dict.keys():
+#                 log('image_slice 0 in angle_dict')
+#                 angle = 0
+#             else:
+#                 angle_list = [[key, value] for key, value in angle_dict.items()]
+#                 angle_list.sort(key=lambda x: x[1])
+#                 log('image_slice angle_list ' + str(angle_list))
+#                 angle = angle_list[-1][0]
+#             for i in range(len(_image_np_list)):
+#                 _image_np_list[i] = image_rotate(_image_np_list[i], angle)
+#             if angle in [180]:
+#                 _image_np_list.reverse()
+#
+#         if len(_image_np_list) < 1:
+#             log("image_slice failed!")
+#             _image_np_list = [_image_np]
+#         return _image_np_list
+#
+#     def get_text_box_obj(self, _text_list, _bbox_list):
+#         from format_convert.convert_tree import TextBox
+#         _text_box_list = []
+#         for i in range(len(_bbox_list)):
+#             bbox = _bbox_list[i]
+#             b_text = _text_list[i]
+#             _text_box_list.append(TextBox([bbox[0][0], bbox[0][1],
+#                                            bbox[2][0], bbox[2][1]], b_text))
+#         return _text_box_list
+#
+#     def save_b_table(self, image_np2, text_box_list2, from_pdf=False):
+#         _start_time = time.time()
+#         _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table'
+#         # _path = 'D:/Project/format_conversion_maxcompute/save_b_table'
+#         max_index = 20000
+#         if os.path.exists(_path):
+#             file_list = glob(_path + '/*')
+#             if file_list:
+#                 file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
+#                 file_index_list.sort(key=lambda x: x)
+#                 index = file_index_list[-1] + 1
+#             else:
+#                 index = 0
+#             if index > max_index:
+#                 return
+#
+#             # 文件md5
+#             from format_convert import _global
+#             _md5 = _global.get("md5")
+#
+#             _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
+#             cv2.imwrite(_image_path, image_np2)
+#             log('save b_table image success!')
+#
+#             # if from_pdf:
+#             #     _file_path = _path + '/' + str(_md5) + '-' + str(index) + '.txt'
+#             #     new_text_box_list2 = [str(x) + '\n' for x in text_box_list2]
+#             #     with open(_file_path, 'w') as f:
+#             #         f.writelines(new_text_box_list2)
+#             #     log('save b_table txt success!')
+#
+#         log('save_b_table cost: ' + str(time.time()-_start_time))
+#
+#     def table_textbox_split(self, image_np2, connect_textbox_list, textbox_list):
+#         """
+#         两个单元格里的文本被ocr识别为一个,需分开才能准确放进表格
+#
+#         :return:
+#         """
+#         split_bbox_list = []
+#         split_text_list = []
+#         splited_textbox_list = []
+#         for textbox in connect_textbox_list:
+#             bbox = textbox.bbox
+#             bbox = [[bbox[0], bbox[1]], [], [bbox[2], bbox[3]], []]
+#             sub_image_np = image_np2[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
+#             split_index_list = []
+#             # 从左到右遍历img
+#             for i in range(5, sub_image_np.shape[1]-5):
+#                 # 找表格分割线,这一列都为黑色像素
+#                 if np.where(sub_image_np[:, i, 0] < 200)[0].size >= sub_image_np.shape[0]:
+#                     split_index_list.append(i)
+#
+#             # 判断两线之间宽度,去重
+#             if len(split_index_list) > 1:
+#                 last_index = split_index_list[0]
+#                 temp_list = []
+#                 delete_list = []
+#                 for index in split_index_list[1:]:
+#                     if index in delete_list:
+#                         continue
+#                     if index - last_index <= 5:
+#                         delete_list.append(index)
+#                     else:
+#                         last_index = index
+#                     temp_list.append(last_index)
+#                 split_index_list = temp_list
+#
+#             # n条以上分割线,有问题
+#             if len(split_index_list) == 0 or len(split_index_list) >= 2:
+#                 # print('len(split_index_list)', len(split_index_list), split_index_list)
+#                 continue
+#             else:
+#                 # 根据index拆开图片,重新ocr
+#                 split_index_list.insert(0, 0)
+#                 print('split_index_list1', split_index_list)
+#                 for _i, index in enumerate(split_index_list):
+#                     if _i == len(split_index_list) - 1:
+#                         split_image_np = sub_image_np[:, index:, :]
+#                         split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[2][0], bbox[2][1]], []])
+#                     else:
+#                         next_index = split_index_list[_i+1]
+#                         split_image_np = sub_image_np[:, index:next_index, :]
+#                         split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[0][0]+next_index, bbox[2][1]], []])
+#
+#                     # ocr
+#                     split_image_bytes = np2bytes(split_image_np)
+#                     text_list2, bbox_list2 = from_ocr_interface(split_image_bytes, is_table=1, only_rec=1)
+#                     # print('text_list2', text_list2)
+#                     # print('bbox_list2', split_bbox_list)
+#                     if judge_error_code(text_list2):
+#                         text2 = ''
+#                     else:
+#                         if text_list2:
+#                             text2 = text_list2[0]
+#                         else:
+#                             text2 = ''
+#                     split_text_list.append(text2)
+#                 splited_textbox_list.append(textbox)
+#
+#         if split_text_list and split_bbox_list:
+#             split_textbox_list = self.get_text_box_obj(split_text_list, split_bbox_list)
+#             for tb in splited_textbox_list:
+#                 if tb in textbox_list:
+#                     textbox_list.remove(tb)
+#             textbox_list += split_textbox_list
+#
+#         return textbox_list
+#
+#     def __call__(self):
+#         from format_convert.convert_tree import _Table, _Sentence
+#         log("into image_preprocess")
+#         try:
+#             if self.image_np is None:
+#                 log("image_preprocess image_np is None")
+#                 return []
+#             if self.image_np.shape[0] <= 20 or self.image_np.shape[1] <= 20:
+#                 log('image_np.shape[0] <= 20 or image_np.shape[1] <= 20')
+#                 return []
+#
+#             if not self.b_table_from_text:
+#                 # 判断是否需要长图分割
+#                 idc_flag = False
+#                 image_np_list = self.slice_process(self.image_np)
+#                 if len(image_np_list) > 1:
+#                     idc_flag = True
+#
+#                 reverse_flag = 0
+#                 table_textbox_list = []
+#                 for image_np in image_np_list:
+#                     # 整体分辨率限制
+#                     image_np = self.resize_process(image_np)
+#
+#                     # 印章去除
+#                     image_np = self.isr_process(image_np)
+#                     if isinstance(image_np, list):
+#                         return image_np
+#
+#                     # 文字识别
+#                     text_list, box_list = self.ocr_process(image_np)
+#                     if judge_error_code(text_list):
+#                         return text_list
+#
+#                     # 判断ocr识别是否正确
+#                     # print('ocr_cant_read(text_list, box_list)', ocr_cant_read(text_list, box_list), idc_flag, text_list)
+#                     if ocr_cant_read(text_list, box_list) and not idc_flag:
+#                         # 方向分类
+#                         image_np, angle = self.idc_process(image_np, return_angle=True)
+#                         if isinstance(image_np, list):
+#                             return image_np
+#                         # 如果角度不变,旋转180
+#                         if angle in [0, 360]:
+#                             pass
+#                             # log('ocr_cant_read image_rotate 180')
+#                             # image_np = image_rotate(image_np, angle=180)
+#                             # reverse_flag = 1
+#                             # image_pil = Image.fromarray(image_np)
+#                             # image_np = np.array(image_pil.rotate(180, expand=1))
+#                         # cv2.imshow("idc_process", image_np)
+#                         # cv2.waitKey(0)
+#
+#                         # 文字识别
+#                         text_list1, box_list_1 = self.ocr_process(image_np)
+#                         if judge_error_code(text_list1):
+#                             return text_list1
+#
+#                         if len(text_list1) > 0 and ocr_cant_read(text_list1, box_list_1) and self.is_from_pdf:
+#                             return [-16]
+#
+#                         # 比较字数
+#                         # print("ocr process", len("".join(text_list)), len("".join(text_list1)))
+#                         if len("".join(text_list)) < len("".join(text_list1)):
+#                             text_list = text_list1
+#                             box_list = box_list_1
+#
+#                     # 表格识别
+#                     line_list = self.otr_process(image_np)
+#                     if judge_error_code(line_list):
+#                         return line_list
+#
+#                     # 生成TextBox对象
+#                     text_box_list = self.get_text_box_obj(text_list, box_list)
+#                     # for t in text_box_list:
+#                     #     print('text_box0', t.get_text())
+#
+#                     # 表格生成
+#                     text_box_list, table_list, obj_in_table_list = self.table_process(line_list, text_box_list, image_np)
+#                     # for t in text_box_list:
+#                     #     print('text_box1', t.get_text())
+#                     # print('table_list', table_list)
+#                     # for t in obj_in_table_list:
+#                     #     print('obj_text_box2', t.get_text())
+#                     if judge_error_code(table_list):
+#                         return table_list
+#
+#                     # 无边框表格识别
+#                     start_time = time.time()
+#                     text_box_list, b_table_list, b_obj_in_table_list \
+#                         = self.botr_process(image_np, table_list, text_list, box_list,
+#                                             text_box_list, obj_in_table_list, self.b_table_from_text,
+#                                             self.pdf_obj_list, self.pdf_layout_size,
+#                                             )
+#                     log('botr process cost: ' + str(time.time()-start_time))
+#
+#                     # 合并非表格的同一行TextBox
+#                     text_box_list = self.merge_textbox(text_box_list, obj_in_table_list)
+#
+#                     table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
+#
+#                 if reverse_flag:
+#                     table_textbox_list.reverse()
+#
+#                     for i in range(len(image_np_list)):
+#                         image_np_list[i] = image_rotate(image_np_list[i], angle=180)
+#                     image_np_list.reverse()
+#
+#                 # index = 0
+#                 # for image_np in image_np_list:
+#                 #     cv2.imshow(str(index) + '.jpg', image_np)
+#                 #     cv2.waitKey(0)
+#                 #     index += 1
+#
+#                 # 对象生成
+#                 all_obj_list = []
+#                 _add_y = 0
+#                 for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
+#                     obj_list = []
+#                     for table in table_list:
+#                         _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+#                         _table = _Table(table["table"], _table_bbox)
+#                         obj_list.append(_table)
+#                     for table in b_table_list:
+#                         _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+#                         _table = _Table(table["table"], _table_bbox)
+#                         obj_list.append(_table)
+#                     for text_box in text_box_list:
+#                         if text_box not in obj_in_table_list:
+#                             text_box.bbox[1] += _add_y
+#                             obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
+#
+#                     # 多图修正y
+#                     if len(image_np_list) > 1:
+#                         list_y = []
+#                         for obj in obj_list:
+#                             obj.y += _add_y
+#                             list_y.append(obj.y)
+#                         if len(list_y) > 0:
+#                             _add_y += max(list_y)
+#
+#                     # 合并
+#                     all_obj_list += obj_list
+#
+#             # 无边框表格图片
+#             else:
+#                 all_obj_list = []
+#                 table_list = []
+#                 text_list = []
+#                 box_list = []
+#                 text_box_list = []
+#                 obj_in_table_list = set()
+#
+#                 # 表格识别
+#                 line_list = self.otr_process(self.image_np)
+#                 if judge_error_code(line_list):
+#                     return line_list
+#
+#                 # 生成TextBox对象
+#                 text_box_list = self.get_text_box_obj(text_list, box_list)
+#
+#                 # 表格生成
+#                 text_box_list, table_list, obj_in_table_list = self.table_process(line_list, text_box_list, self.image_np)
+#                 if judge_error_code(table_list):
+#                     return table_list
+#
+#                 # 无边框表格识别
+#                 start_time = time.time()
+#                 text_box_list, table_list, obj_in_table_list \
+#                     = self.botr_process(self.image_np, table_list,
+#                                         text_list, box_list,
+#                                                                             text_box_list,
+#                                                                             obj_in_table_list,
+#                                         self.b_table_from_text,
+#                                         self.pdf_obj_list,
+#                                         self.pdf_layout_size,
+#                                                                             )
+#                 log('botr process cost: ' + str(time.time()-start_time))
+#
+#                 # 合并非表格的同一行TextBox
+#                 text_box_list = self.merge_textbox(text_box_list, obj_in_table_list)
+#
+#                 # 对象生成
+#                 obj_list = []
+#                 # print('table_list', table_list)
+#                 for table in table_list:
+#                     _table = _Table(table["table"], table["bbox"])
+#                     obj_list.append(_table)
+#                 for text_box in text_box_list:
+#                     if text_box not in obj_in_table_list:
+#                         obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
+#
+#                 # 合并
+#                 all_obj_list += obj_list
+#
+#             return all_obj_list
+#
+#         except Exception as e:
+#             log("image_preprocess error")
+#             traceback.print_exc()
+#             return [-1]
+
+
 @memory_decorator
 @memory_decorator
 def picture2text(path, html=False):
 def picture2text(path, html=False):
     log("into picture2text")
     log("into picture2text")
@@ -786,6 +1554,21 @@ def get_best_predict_size2(image_np, threshold=3000):
     return h, w
     return h, w
 
 
 
 
def get_best_predict_size_by_area(image_np, threshold=1280):
    """Shrink (height, width) so the image area fits within threshold*threshold.

    Aspect ratio is preserved; images already small enough are returned
    unchanged.

    :param image_np: numpy image array of shape (H, W[, C])
    :param threshold: side length defining the maximum allowed area
    :return: (new_height, new_width) as ints
    """
    h, w = image_np.shape[:2]
    area_limit = threshold * threshold
    current_area = h * w

    if current_area <= area_limit:
        return h, w

    # uniform scale factor mapping current_area onto area_limit
    ratio = (area_limit / current_area) ** 0.5
    return int(h * ratio), int(w * ratio)
+
 def image_slice(image_np):
 def image_slice(image_np):
     """
     """
     slice the image if the height is to large
     slice the image if the height is to large
@@ -1269,6 +2052,17 @@ def image_process_old(image_np, image_path, is_from_pdf=False, is_from_docx=Fals
 
 
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    img111 = cv2.imread("C:/Users/Administrator/Downloads/1724146601927.png")
-    cv2.imshow('111', img111)
-    cv2.waitKey(0)
+    # _pp = r'D:\Project\format_conversion_maxcompute\save_b_table' \
+    #       r'\211-6591070e1cc8ea6904ba00a0a3d6c32f.png'
+    _pp = r'C:\Users\Administrator\Desktop\test_b_table\error7.png'
+    save_pp = r'D:\Project\format_conversion_maxcompute\format_convert\temp\test_convert_image.jpg'
+    # img111 = cv2.imread(_pp)
+    # img111 = pil_resize(img111, 1024, 768)
+    # cv2.imwrite(save_pp, img111)
+    # image_process(img111, '')
+    # cv2.imshow('111', img111)
+    # cv2.waitKey(0)
+
+    _html = ImageConvert(_pp, r"D:\Project\format_conversion_maxcompute\format_convert\temp").get_html()
+    with open('../result.html', 'w', encoding='utf-8') as f:
+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])

+ 26 - 9
format_convert/convert_need_interface.py

@@ -144,6 +144,7 @@ def from_office_interface_240606(src_path, dest_path, target_format, retry_times
 
 
 
 
 def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
 def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
+    start_time = time.time()
     try:
     try:
         if from_remote:
         if from_remote:
             # 重试
             # 重试
@@ -200,6 +201,8 @@ def from_office_interface(src_path, dest_path, target_format, retry_times=1, fro
         log("from_office_interface error!")
         log("from_office_interface error!")
         traceback.print_exc()
         traceback.print_exc()
         return [-1]
         return [-1]
+    finally:
+        log("from_office_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_tika_interface(src_path, from_remote=FROM_REMOTE):
 def from_tika_interface(src_path, from_remote=FROM_REMOTE):
@@ -239,17 +242,21 @@ def from_tika_interface(src_path, from_remote=FROM_REMOTE):
             return [-2]
             return [-2]
 
 
         _dict = r
         _dict = r
-        html = _dict.get("html")
-        log("from_tika_interface cost time " + str(time.time()-start_time))
-        return html
+        data = _dict.get("data")
+
+        return data
     except Exception as e:
     except Exception as e:
         log("from_tika_interface error!")
         log("from_tika_interface error!")
         traceback.print_exc()
         traceback.print_exc()
         return [-11]
         return [-11]
+    finally:
+        log("from_tika_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_REMOTE):
 def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_REMOTE):
     log("into from_ocr_interface")
     log("into from_ocr_interface")
+    # print('FROM_REMOTE', FROM_REMOTE)
+    start_time = time.time()
     try:
     try:
         base64_stream = base64.b64encode(image_stream)
         base64_stream = base64.b64encode(image_stream)
 
 
@@ -281,7 +288,10 @@ def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_RE
                             log("retry post ocr_interface... left times " + str(retry_times_1))
                             log("retry post ocr_interface... left times " + str(retry_times_1))
                             continue
                             continue
                     if judge_error_code(r):
                     if judge_error_code(r):
-                        return r
+                        if is_table:
+                            return r, r
+                        else:
+                            return r
                     break
                     break
             else:
             else:
                 if globals().get("global_ocr_model") is None:
                 if globals().get("global_ocr_model") is None:
@@ -326,6 +336,8 @@ def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_RE
             return [-1], [-1]
             return [-1], [-1]
         else:
         else:
             return [-1]
             return [-1]
+    finally:
+        log("from_ocr_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_gpu_interface_redis(_dict, model_type, predictor_type):
 def from_gpu_interface_redis(_dict, model_type, predictor_type):
@@ -366,6 +378,7 @@ def from_gpu_interface_redis(_dict, model_type, predictor_type):
 
 
 def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE):
 def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE):
     log("into from_otr_interface")
     log("into from_otr_interface")
+    start_time = time.time()
     try:
     try:
         base64_stream = base64.b64encode(image_stream)
         base64_stream = base64.b64encode(image_stream)
 
 
@@ -424,6 +437,8 @@ def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE)
         log("from_otr_interface error!")
         log("from_otr_interface error!")
         print("from_otr_interface", traceback.print_exc())
         print("from_otr_interface", traceback.print_exc())
         return [-1]
         return [-1]
+    finally:
+        log("from_otr_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
 def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
@@ -487,7 +502,6 @@ def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
             image_np = cv2.imdecode(buffer, 1)
             image_np = cv2.imdecode(buffer, 1)
         else:
         else:
             image_np = _dict.get("image")
             image_np = _dict.get("image")
-        log("from_isr_interface cost time " + str(time.time()-start_time))
         return image_np
         return image_np
     except Exception as e:
     except Exception as e:
         log("from_isr_interface error!")
         log("from_isr_interface error!")
@@ -495,7 +509,7 @@ def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
         return [-11]
         return [-11]
     finally:
     finally:
         # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
         # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
-        pass
+        log("from_isr_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_idc_interface(image_stream, from_remote=FROM_REMOTE):
 def from_idc_interface(image_stream, from_remote=FROM_REMOTE):
@@ -543,12 +557,13 @@ def from_idc_interface(image_stream, from_remote=FROM_REMOTE):
 
 
         _dict = r
         _dict = r
         angle = _dict.get("angle")
         angle = _dict.get("angle")
-        log("from_idc_interface cost time " + str(time.time()-start_time))
         return angle
         return angle
     except Exception as e:
     except Exception as e:
         log("from_idc_interface error!")
         log("from_idc_interface error!")
         traceback.print_exc()
         traceback.print_exc()
         return [-11]
         return [-11]
+    finally:
+        log("from_idc_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_atc_interface(text, from_remote=FROM_REMOTE):
 def from_atc_interface(text, from_remote=FROM_REMOTE):
@@ -594,12 +609,13 @@ def from_atc_interface(text, from_remote=FROM_REMOTE):
 
 
         _dict = r
         _dict = r
         classification = _dict.get("classification")
         classification = _dict.get("classification")
-        log("from_atc_interface cost time " + str(time.time()-start_time))
         return classification
         return classification
     except Exception as e:
     except Exception as e:
         log("from_atc_interface error!")
         log("from_atc_interface error!")
         traceback.print_exc()
         traceback.print_exc()
         return [-11]
         return [-11]
+    finally:
+        log("from_atc_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
 def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
@@ -652,12 +668,13 @@ def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
 
 
         _dict = r
         _dict = r
         b_table_list = _dict.get("b_table_list")
         b_table_list = _dict.get("b_table_list")
-        log("from_yolo_interface cost time " + str(time.time()-start_time))
         return b_table_list
         return b_table_list
     except Exception as e:
     except Exception as e:
         log("from_yolo_interface error!")
         log("from_yolo_interface error!")
         traceback.print_exc()
         traceback.print_exc()
         return [-11]
         return [-11]
+    finally:
+        log("from_yolo_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def interface_pool_gunicorn(interface_type):
 def interface_pool_gunicorn(interface_type):

+ 75 - 0
format_convert/convert_ofd.py

@@ -0,0 +1,75 @@
+import base64
+import os
+import re
+import sys
+import time
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
+from format_convert.easyofd.easyofd.ofd import OFD
+from format_convert.convert_tree import _Document, _Sentence, _Page
+import logging
+import traceback
+from format_convert.convert_pdf import PDFConvert
+from format_convert.utils import judge_error_code, get_logger, log
+
+
class OfdConvert:
    """Convert an OFD file to html by first rendering it to PDF.

    The OFD is parsed with easyofd, written out as a PDF next to the
    original file, then delegated to PDFConvert for the actual extraction.
    """

    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self.path = path
        # assumed to end with a path separator (callers pass e.g. ".../temp/")
        self.unique_type_dir = unique_type_dir
        self.ofd = OFD()  # OFD parsing / pdf-rendering helper

    def convert(self):
        """Render the OFD to a PDF file and prepare a PDFConvert for it.

        Sets ``self._pdf``; raises on any parsing/rendering failure (the
        caller, ``get_html``, translates that into an error code).
        """
        start_time = time.time()
        file_prefix = os.path.splitext(os.path.split(self.path)[1])[0]

        with open(self.path, "rb") as f:
            ofd_b64 = str(base64.b64encode(f.read()), "utf-8")

        # parse the base64-encoded OFD; xml dumping is disabled
        self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_prefix}_xml",
                      save_dir=self.unique_type_dir)
        # pages easyofd could not draw natively are returned so the pdf
        # converter can fall back to image processing for them
        pdf_bytes, page_need_to_image_dict = self.ofd.to_pdf(return_need_convert_as_image=True)
        log('ofd to pdf cost: ' + str(time.time() - start_time))

        self.ofd.del_data()

        # derive the output pdf path from the source file name; the raw-string
        # regex handles both / and \ separators, and splitext is safer than
        # slicing a fixed number of extension characters
        file_name = re.split(r'[/\\]', self.path)[-1]
        new_path = self.unique_type_dir + os.path.splitext(file_name)[0] + '.pdf'

        with open(new_path, "wb") as f:
            f.write(pdf_bytes)
        log('ofd to pdf path ' + new_path + ' cost: ' + str(time.time() - start_time))

        # delegate the actual text/table extraction to the pdf pipeline
        self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None,
                               page_need_to_image_dict=page_need_to_image_dict)

    def get_html(self):
        """Run the conversion and return the html produced by the pdf stage.

        Returns the error code list (e.g. ``[-1]``) on failure.
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]

        # hand back the pdf pipeline's html, or the error code directly
        if self._doc.error_code is not None:
            return self._doc.error_code
        else:
            return self._pdf.get_html()
+
+
if __name__ == '__main__':
    # Manual smoke test: convert a single ofd file and write the html out.
    src_path = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
    alt_path = '../1750060386706.ofd'  # alternative sample, currently unused
    # src_path = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp\2" + '/'
    converter = OfdConvert(src_path, save_dir)
    html_result = converter.get_html()
    with open('../result.html', 'w', encoding='utf-8') as out_file:
        out_file.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + html_result[0])
+

+ 75 - 0
format_convert/convert_ofd_test.py

@@ -0,0 +1,75 @@
+import base64
+import os
+import re
+import sys
+import time
+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
+
+from format_convert.utils import judge_error_code, get_logger, log, register_all_fonts
+# register_all_fonts("/usr/share/fonts/")
+
+from format_convert.easyofd.easyofd.ofd import OFD
+from format_convert.convert_tree import _Document, _Sentence, _Page
+import logging
+import traceback
+from format_convert.convert_pdf import PDFConvert
+
+
class OfdConvert:
    """Standalone test variant of the OFD converter: OFD -> PDF -> html.

    Mirrors format_convert/convert_ofd.py but without the
    page_need_to_image_dict fallback wiring.
    """

    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self.path = path
        # assumed to end with a path separator
        self.unique_type_dir = unique_type_dir
        self.ofd = OFD()  # OFD parsing / pdf-rendering helper

    def convert(self):
        """Render the OFD file to a PDF and prepare a PDFConvert for it."""
        start_time = time.time()
        file_prefix = os.path.splitext(os.path.split(self.path)[1])[0]

        with open(self.path, "rb") as f:
            ofd_b64 = str(base64.b64encode(f.read()), "utf-8")

        # parse the base64-encoded OFD; xml dumping is disabled
        self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_prefix}_xml",
                      save_dir=self.unique_type_dir)
        pdf_bytes = self.ofd.to_pdf()  # render the whole document to pdf bytes

        self.ofd.del_data()

        # derive the output pdf path from the source file name; the raw-string
        # regex handles both / and \ separators, and splitext is safer than
        # slicing a fixed number of extension characters
        file_name = re.split(r'[/\\]', self.path)[-1]
        new_path = self.unique_type_dir + os.path.splitext(file_name)[0] + '.pdf'

        with open(new_path, "wb") as f:
            f.write(pdf_bytes)
        log('ofd to pdf path ' + new_path + ' cost: ' + str(time.time() - start_time))

        # delegate the actual extraction to the pdf pipeline
        self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None)

    def get_html(self):
        """Run the conversion and return the html from the pdf stage.

        Returns the error code list (e.g. ``[-1]``) on failure.
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]

        # hand back the pdf pipeline's html, or the error code directly
        if self._doc.error_code is not None:
            return self._doc.error_code
        else:
            return self._pdf.get_html()
+
+
if __name__ == '__main__':
    # Manual smoke test: convert one ofd sample and print the html result.
    _p = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
    _p = '../1750381792388.ofd'  # overrides the path above
    # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
    save_dir = "/data/fangjiasheng/format_conversion_maxcompute/format_convert/temp" + '/'
    converter = OfdConvert(_p, save_dir)
    html_result = converter.get_html()
    print(html_result)
+
+

+ 352 - 51
format_convert/convert_pdf.py

@@ -1,3 +1,6 @@
+import shutil
+import zlib
+from glob import glob
 import copy
 import copy
 import io
 import io
 import os
 import os
@@ -23,10 +26,12 @@ from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
     LTTextBoxVertical, LTLine, LTTextContainer, LTTextLine
     LTTextBoxVertical, LTLine, LTTextContainer, LTTextLine
 from format_convert.utils import judge_error_code, get_platform, LineTable, log, \
 from format_convert.utils import judge_error_code, get_platform, LineTable, log, \
-    memory_decorator, get_garble_code, get_md5_from_bytes, bytes2np, bbox_iou, get_garble_code2, get_traditional_chinese
+    memory_decorator, get_garble_code, get_md5_from_bytes, bytes2np, bbox_iou, get_garble_code2, \
+    get_traditional_chinese, ascii85_decode
 import fitz
 import fitz
 from format_convert.wrapt_timeout_decorator import timeout
 from format_convert.wrapt_timeout_decorator import timeout
 from otr.table_line_pdf import table_line_pdf
 from otr.table_line_pdf import table_line_pdf
+from botr.extract_table import get_b_table_by_blank_colon
 
 
 
 
 @memory_decorator
 @memory_decorator
@@ -38,6 +43,7 @@ def pdf2text(path, unique_type_dir):
 def pdf_analyze(interpreter, page, device, page_no):
 def pdf_analyze(interpreter, page, device, page_no):
     pdf_time = time.time()
     pdf_time = time.time()
     interpreter.process_page(page)
     interpreter.process_page(page)
+    # print('interpreter.process_page time', time.time()-pdf_time)
     layout = device.get_result()
     layout = device.get_result()
     log("page_no: " + str(page_no) + " pdf_analyze cost: " + str(time.time() - pdf_time))
     log("page_no: " + str(page_no) + " pdf_analyze cost: " + str(time.time() - pdf_time))
     return layout
     return layout
@@ -76,7 +82,7 @@ def read_pdfplumber(path, laparams):
 
 
 
 
 class PDFConvert:
 class PDFConvert:
-    def __init__(self, path, unique_type_dir, need_page_no):
+    def __init__(self, path, unique_type_dir, need_page_no, page_need_to_image_dict=None):
         self._doc = _Document(path)
         self._doc = _Document(path)
         self.path = path
         self.path = path
         self.unique_type_dir = unique_type_dir
         self.unique_type_dir = unique_type_dir
@@ -89,7 +95,7 @@ class PDFConvert:
         self.end_page_no = None
         self.end_page_no = None
         # 默认使用limit_page_cnt控制,前10页后10页
         # 默认使用limit_page_cnt控制,前10页后10页
         if self.need_page_no is None:
         if self.need_page_no is None:
-            self.limit_page_cnt = 20
+            self.limit_page_cnt = 50
         else:
         else:
             # 使用start_page_no,end_page_no范围控制,例如2,5
             # 使用start_page_no,end_page_no范围控制,例如2,5
             ss = self.need_page_no.split(',')
             ss = self.need_page_no.split(',')
@@ -120,6 +126,12 @@ class PDFConvert:
         # 初始化_page
         # 初始化_page
         self._page = _Page(None, 0)
         self._page = _Page(None, 0)
 
 
+        # 需要直接转成image来识别的页面
+        if type(page_need_to_image_dict) is not dict:
+            self.page_need_to_image_dict = {}
+        else:
+            self.page_need_to_image_dict = page_need_to_image_dict
+
     @memory_decorator
     @memory_decorator
     def init_package(self, package_name):
     def init_package(self, package_name):
         # 各个包初始化
         # 各个包初始化
@@ -128,7 +140,9 @@ class PDFConvert:
                                 char_margin=0.3,
                                 char_margin=0.3,
                                 line_margin=0.01,
                                 line_margin=0.01,
                                 word_margin=0.01,
                                 word_margin=0.01,
-                                boxes_flow=0.1, )
+                                # boxes_flow=0.1,
+                                boxes_flow=None,
+                                )
             if package_name == self.packages[0]:
             if package_name == self.packages[0]:
                 self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
                 self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
                 self.has_init_pdf[0] = 1
                 self.has_init_pdf[0] = 1
@@ -153,7 +167,7 @@ class PDFConvert:
             self._doc.error_code = [-3]
             self._doc.error_code = [-3]
 
 
     @memory_decorator
     @memory_decorator
-    def convert(self, limit_page_cnt=20):
+    def convert(self, limit_page_cnt=50):
         if self.has_init_pdf[0] == 0:
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
             self.init_package("pdfminer")
         if self._doc.error_code is not None:
         if self._doc.error_code is not None:
@@ -201,8 +215,11 @@ class PDFConvert:
                     continue
                     continue
             # 限制pdf页数,只取前后各10页
             # 限制pdf页数,只取前后各10页
             else:
             else:
-                if page_count > limit_page_cnt and int(limit_page_cnt / 2) <= page_no < page_count - int(
-                        limit_page_cnt / 2):
+                # if page_count > limit_page_cnt and int(limit_page_cnt / 2) <= page_no < page_count - int(
+                #         limit_page_cnt / 2):
+                #     page_no += 1
+                #     continue
+                if page_count > limit_page_cnt and page_no >= limit_page_cnt:
                     page_no += 1
                     page_no += 1
                     continue
                     continue
 
 
@@ -222,6 +239,8 @@ class PDFConvert:
         delete_water_mark_list = []
         delete_water_mark_list = []
 
 
         for layout, layout_obj_list, max_y, page_no in layout_list:
         for layout, layout_obj_list, max_y, page_no in layout_list:
+            # for obj in layout_obj_list:
+            #     print('obj', obj)
             # 解析单页
             # 解析单页
             start_time = time.time()
             start_time = time.time()
             self._page = _Page(None, page_no)
             self._page = _Page(None, page_no)
@@ -251,7 +270,10 @@ class PDFConvert:
                 find_flag = 0
                 find_flag = 0
                 add_page_list = []
                 add_page_list = []
                 for page in pages:
                 for page in pages:
-                    if not int(limit_page_cnt / 2) <= page_no < page_count - int(limit_page_cnt / 2):
+                    # if not int(limit_page_cnt / 2) <= page_no < page_count - int(limit_page_cnt / 2):
+                    #     page_no += 1
+                    #     continue
+                    if not (page_no >= limit_page_cnt):
                         page_no += 1
                         page_no += 1
                         continue
                         continue
 
 
@@ -297,9 +319,11 @@ class PDFConvert:
                     page_no += 1
                     page_no += 1
 
 
                 if add_page_list:
                 if add_page_list:
-                    self._doc.children = self._doc.children[
-                                         :int(limit_page_cnt / 2)] + add_page_list + self._doc.children[
-                                                                                     int(limit_page_cnt / 2):]
+                    # self._doc.children = self._doc.children[:int(limit_page_cnt / 2)] \
+                    #                      + add_page_list \
+                    #                      + self._doc.children[int(limit_page_cnt / 2):]
+                    self._doc.children = self._doc.children[:limit_page_cnt] \
+                                         + add_page_list
 
 
         self.delete_same_image()
         self.delete_same_image()
         # self.delete_bold_text_duplicate()
         # self.delete_bold_text_duplicate()
@@ -375,10 +399,14 @@ class PDFConvert:
 
 
         return pages, delete_footer_header_list
         return pages, delete_footer_header_list
 
 
+    @memory_decorator
     def delete_bold_text_duplicate(self, lt_text_box_list):
     def delete_bold_text_duplicate(self, lt_text_box_list):
         # 拿出所有LTChar
         # 拿出所有LTChar
         lt_char_list = []
         lt_char_list = []
         for lt_text_box in lt_text_box_list:
         for lt_text_box in lt_text_box_list:
+            if '.......' in lt_text_box.get_text():
+                # print('....... lt_text_box continue')
+                continue
             for lt_text_line in lt_text_box:
             for lt_text_line in lt_text_box:
                 for lt_char in lt_text_line:
                 for lt_char in lt_text_line:
                     if isinstance(lt_char, LTChar):
                     if isinstance(lt_char, LTChar):
@@ -447,14 +475,16 @@ class PDFConvert:
     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
         list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
         list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
                                                                                     from_pdf=True, is_reverse=False)
                                                                                     from_pdf=True, is_reverse=False)
-        self._page.in_table_objs = filter_objs
+        # self._page.in_table_objs = filter_objs
 
 
         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
 
 
+        table_list = []
         for table in list_tables:
         for table in list_tables:
             _table = _Table(table["table"], table["bbox"])
             _table = _Table(table["table"], table["bbox"])
             # self._page.children.append(_table)
             # self._page.children.append(_table)
             self._page.add_child(_table)
             self._page.add_child(_table)
+            table_list.append(_table)
 
 
         list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
         list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
                                                         layout.bbox, page_no)
                                                         layout.bbox, page_no)
@@ -466,7 +496,7 @@ class PDFConvert:
         # pdf对象需反向排序
         # pdf对象需反向排序
         # self._page.is_reverse = True
         # self._page.is_reverse = True
 
 
-        return list_tables
+        return table_list
 
 
     def is_text_legal(self, lt_text_list, page_no):
     def is_text_legal(self, lt_text_list, page_no):
         # 无法识别pdf字符编码,整页用ocr
         # 无法识别pdf字符编码,整页用ocr
@@ -498,10 +528,11 @@ class PDFConvert:
 
 
         return True
         return True
 
 
+    @memory_decorator
     def judge_b_table(self, lt_text_list, table_list, page_no):
     def judge_b_table(self, lt_text_list, table_list, page_no):
         table_h_list = []
         table_h_list = []
         for table in table_list:
         for table in table_list:
-            table_h_list.append([table.get('bbox')[1], table.get('bbox')[3]])
+            table_h_list.append([table.bbox[1], table.bbox[3]])
 
 
         # 先分行
         # 先分行
         lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
         lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
@@ -528,6 +559,8 @@ class PDFConvert:
         row_cnt = 0
         row_cnt = 0
         b_table_row_list = []
         b_table_row_list = []
         all_b_table = []
         all_b_table = []
+        row_col_list = []
+        all_row_col_list = []
         for row in lt_text_row_list:
         for row in lt_text_row_list:
             # 水印行跳过
             # 水印行跳过
             if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
             if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
@@ -537,6 +570,7 @@ class PDFConvert:
             for r in row:
             for r in row:
                 if re.search('[.·]{7,}', r.get_text()):
                 if re.search('[.·]{7,}', r.get_text()):
                     continue_flag = True
                     continue_flag = True
+                    all_row_col_list = []
                     break
                     break
             if continue_flag:
             if continue_flag:
                 continue
                 continue
@@ -550,6 +584,7 @@ class PDFConvert:
                     row_cnt += 1
                     row_cnt += 1
                     t_cnt = 0
                     t_cnt = 0
                     b_table_row_list += row
                     b_table_row_list += row
+                    row_col_list += [row]
                 else:
                 else:
                     # 容忍
                     # 容忍
                     if t_cnt < tolerate_cnt:
                     if t_cnt < tolerate_cnt:
@@ -557,15 +592,36 @@ class PDFConvert:
                         continue
                         continue
                     if b_table_row_list and row_cnt >= is_b_table_cnt:
                     if b_table_row_list and row_cnt >= is_b_table_cnt:
                         all_b_table.append(b_table_row_list)
                         all_b_table.append(b_table_row_list)
+                        all_row_col_list.append(row_col_list)
                     row_cnt = 0
                     row_cnt = 0
                     b_table_row_list = []
                     b_table_row_list = []
+                    row_col_list = []
             else:
             else:
                 row_cnt += 1
                 row_cnt += 1
                 t_cnt = 0
                 t_cnt = 0
                 b_table_row_list += row
                 b_table_row_list += row
+                row_col_list += [row]
 
 
         if b_table_row_list and row_cnt >= is_b_table_cnt:
         if b_table_row_list and row_cnt >= is_b_table_cnt:
             all_b_table.append(b_table_row_list)
             all_b_table.append(b_table_row_list)
+            all_row_col_list.append(row_col_list)
+            # print('b_table_row_list', b_table_row_list)
+
+        # 排除大部分是两列的,因为前面已经新增了两列无边框的单独识别
+        # print('len(all_row_col_list)', len(all_row_col_list))
+        row_cnt = 0
+        col_2_cnt = 0
+        for row_col_list in all_row_col_list:
+            for col_list in row_col_list:
+                row_cnt += 1
+                if len(col_list) == 2:
+                    col_2_cnt += 1
+                # print('col_list', col_list)
+
+        # print('row_cnt, col_2_cnt', row_cnt, col_2_cnt)
+        if row_cnt == 0 or col_2_cnt / row_cnt >= 0.5:
+            log("page_no: " + str(page_no) + ' is_b_table_flag False')
+            return False
 
 
         # 对每个可能的b_table判断是否与table相交
         # 对每个可能的b_table判断是否与table相交
         is_b_table_flag = False
         is_b_table_flag = False
@@ -587,8 +643,35 @@ class PDFConvert:
                 # print('table_h_list', table_h_list)
                 # print('table_h_list', table_h_list)
                 break
                 break
         log("page_no: " + str(page_no) + ' is_b_table_flag ' + str(is_b_table_flag))
         log("page_no: " + str(page_no) + ' is_b_table_flag ' + str(is_b_table_flag))
+        # 保存判断为True的pdf
+        # if is_b_table_flag:
+        #     self.save_b_table_pdf(page_no)
         return is_b_table_flag
         return is_b_table_flag
 
 
+    def save_b_table_pdf(self, page_no):
+        """Debug helper: copy the current PDF into a fixed sample directory
+        so pages judged as borderless tables can be inspected later.
+
+        Silently returns when the directory is missing or already holds more
+        than max_index samples. Files are named '<index>-<page_no>.pdf'.
+        """
+        # save_dir = r"D:\Project\format_conversion_maxcompute\save_b_table_pdf"
+        save_dir = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_pdf'
+        max_index = 200
+        if os.path.exists(save_dir):
+            file_list = glob(save_dir + '/*')
+            if file_list:
+                # Recover the numeric <index> component from existing names
+                # and continue numbering after the largest one found.
+                file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
+                file_index_list.sort(key=lambda x: x)
+                index = file_index_list[-1] + 1
+            else:
+                index = 0
+            if index > max_index:
+                # Sample quota reached; stop collecting.
+                return
+        else:
+            # Collection directory absent on this host — do nothing.
+            return
+
+        save_path = f'{save_dir}/{index}-{page_no}.pdf'
+        try:
+            shutil.copy(self.path, save_path)
+            print("文件复制成功!")
+        except Exception as e:
+            print(f"文件复制失败:{e}")
     def char_to_text_box(self, char_list):
     def char_to_text_box(self, char_list):
         lt_text_box_list = []
         lt_text_box_list = []
 
 
@@ -646,6 +729,7 @@ class PDFConvert:
 
 
         return lt_text_box_list, text_box_char_dict
         return lt_text_box_list, text_box_char_dict
 
 
+    @memory_decorator
     def get_need_objs(self, obj_list, max_y):
     def get_need_objs(self, obj_list, max_y):
         # 文字
         # 文字
         lt_char_list = []
         lt_char_list = []
@@ -695,6 +779,14 @@ class PDFConvert:
             elif isinstance(x, (LTTextContainer, LTRect, LTLine, LTCurve)):
             elif isinstance(x, (LTTextContainer, LTRect, LTLine, LTCurve)):
                 lt_line_list.append(x)
                 lt_line_list.append(x)
 
 
+        # print('len(obj_list)', len(obj_list))
+        # print('len(lt_char_list)', len(lt_char_list))
+        # print('len(lt_text_box_list)', len(lt_text_box_list))
+        # if len(lt_text_box_list) >= 200:
+        #     for lt_text in lt_text_box_list:
+        #         print('>= 200 lt_text', lt_text.get_text())
+        # print('len(lt_image_list)', len(lt_image_list))
+
         if lt_figure_list:
         if lt_figure_list:
             temp_figure_list = []
             temp_figure_list = []
             for sub_figure in lt_figure_list:
             for sub_figure in lt_figure_list:
@@ -719,8 +811,21 @@ class PDFConvert:
 
 
         text_box_char_dict = {**text_box_char_dict, **add_text_box_char_dict}
         text_box_char_dict = {**text_box_char_dict, **add_text_box_char_dict}
 
 
+        lt_text_box_list = self.delete_water_mark_by_location(lt_text_box_list)
+
+        # 分行后过滤
+        temp_list = []
+        for lt_text_box in lt_text_box_list:
+            if lt_text_box.get_text() in ['', ' ', '\t', '\n', '\r']:
+                continue
+            temp_list.append(lt_text_box)
+        if len(lt_text_box_list) != len(temp_list):
+            log('filter lt_text_box_list ' + str(len(lt_text_box_list)) + ' -> ' + str(len(temp_list)))
+        lt_text_box_list = temp_list
+
         return lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, lt_line_list, text_box_char_dict
         return lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, lt_line_list, text_box_char_dict
 
 
+    @memory_decorator
     def read_layout(self, page, page_no):
     def read_layout(self, page, page_no):
         layout = self.get_layout(page, page_no)
         layout = self.get_layout(page, page_no)
         if self._doc.error_code is not None:
         if self._doc.error_code is not None:
@@ -834,6 +939,7 @@ class PDFConvert:
 
 
         return lt_text_box_list
         return lt_text_box_list
 
 
+    @memory_decorator
     def split_text_box_by_lines2(self, lt_line_list, lt_text_box_list, text_box_char_dict):
     def split_text_box_by_lines2(self, lt_line_list, lt_text_box_list, text_box_char_dict):
         """
         """
         有单个字符位置信息,再根据表格线截断位置,分割text
         有单个字符位置信息,再根据表格线截断位置,分割text
@@ -932,12 +1038,23 @@ class PDFConvert:
         return lt_text_box_list
         return lt_text_box_list
 
 
     @memory_decorator
     @memory_decorator
-    # def convert_page(self, page, page_no, skip_image=0):
     def convert_page(self, layout, layout_obj_list, max_y, page_no, delete_water_mark_list, skip_image=0):
     def convert_page(self, layout, layout_obj_list, max_y, page_no, delete_water_mark_list, skip_image=0):
         # 若Page中一个obj都无,后面ocr整页识别 20240820
         # 若Page中一个obj都无,后面ocr整页识别 20240820
         if max_y == 0 and len(layout_obj_list) > 0:
         if max_y == 0 and len(layout_obj_list) > 0:
             return
             return
 
 
+        # 若该页在page_need_to_image_dict中为True,则直接ocr整页识别
+        if self.page_need_to_image_dict.get(page_no) is True:
+            page_image = self.get_page_image(page_no)
+            if judge_error_code(page_image):
+                self._page.error_code = page_image
+            else:
+                _image = _Image(page_image[1], page_image[0])
+                _image.is_from_pdf = True
+                _image.is_reverse = False
+                self._page.add_child(_image)
+            return
+
         lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, \
         lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, \
             lt_line_list, text_box_char_dict = layout_obj_list
             lt_line_list, text_box_char_dict = layout_obj_list
 
 
@@ -999,45 +1116,56 @@ class PDFConvert:
         # 正常读取该页对象
         # 正常读取该页对象
         else:
         else:
             # 图表对象
             # 图表对象
-            for image in lt_image_list:
-                try:
-                    # print("pdf2text LTImage size", page_no, image.width, image.height)
-                    image_stream = image.stream.get_data()
-                    # 小的图忽略
-                    if image.width <= 300 and image.height <= 300:
-                        continue
-                    # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
-                    img_test = Image.open(io.BytesIO(image_stream))
-                    if image.height >= 1000 and image.width >= 1000:
-                        page_image = self.get_page_image(page_no)
-                        if judge_error_code(page_image):
-                            self._page.error_code = page_image
-                        else:
-                            _image = _Image(page_image[1], page_image[0])
-                            _image.is_from_pdf = True
-                            _image.is_reverse = False
-                            self._page.add_child(_image)
-                            image_md5 = get_md5_from_bytes(page_image[1])
-                            self.md5_image_obj_list.append([image_md5, _image])
-                        return
-                    # 比较小的图则直接保存用ocr识别
-                    else:
-                        temp_path = self.unique_type_dir + 'page' + str(page_no) \
-                                    + '_lt' + str(lt_image_list.index(image)) + '.jpg'
-                        img_test.save(temp_path)
-                        with open(temp_path, "rb") as ff:
-                            image_stream = ff.read()
-                        _image = _Image(image_stream, temp_path, image.bbox)
-                        self._page.add_child(_image)
-                        image_md5 = get_md5_from_bytes(image_stream)
-                        self.md5_image_obj_list.append([image_md5, _image])
-                except Exception:
-                    log("page_no: " + str(page_no) + " pdfminer read image fail! use pymupdf read image...")
-                    traceback.print_exc()
+            # for image in lt_image_list:
+            #     try:
+            #         # print("pdf2text LTImage size", page_no, image.width, image.height)
+            #         # image_stream = image.stream.get_data()
+            #         print('image.stream.get_filters()', image.stream.get_filters())
+            #         image_stream = image.stream.get_data()
+            #         # 小的图忽略
+            #         if image.width <= 300 and image.height <= 300:
+            #             continue
+            #         # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
+            #         img_test = Image.open(io.BytesIO(image_stream))
+            #         # img_test = self.pdfminer_stream_to_image(image)
+            #         if image.height >= 1000 and image.width >= 1000:
+            #             page_image = self.get_page_image(page_no)
+            #             if judge_error_code(page_image):
+            #                 self._page.error_code = page_image
+            #             else:
+            #                 _image = _Image(page_image[1], page_image[0])
+            #                 _image.is_from_pdf = True
+            #                 _image.is_reverse = False
+            #                 self._page.add_child(_image)
+            #                 image_md5 = get_md5_from_bytes(page_image[1])
+            #                 self.md5_image_obj_list.append([image_md5, _image])
+            #             return
+            #         # 比较小的图则直接保存用ocr识别
+            #         else:
+            #             temp_path = self.unique_type_dir + 'page' + str(page_no) \
+            #                         + '_lt' + str(lt_image_list.index(image)) + '.jpg'
+            #             img_test.save(temp_path)
+            #             with open(temp_path, "rb") as ff:
+            #                 image_stream = ff.read()
+            #             _image = _Image(image_stream, temp_path, image.bbox)
+            #             self._page.add_child(_image)
+            #             image_md5 = get_md5_from_bytes(image_stream)
+            #             self.md5_image_obj_list.append([image_md5, _image])
+            #     except Exception:
+            #         log("page_no: " + str(page_no) + " pdfminer read image fail! use pymupdf read image...")
+            #         traceback.print_exc()
 
 
             # pdf对象需反向排序
             # pdf对象需反向排序
             # self._page.is_reverse = True
             # self._page.is_reverse = True
 
 
+            status = self.pdfminer_read_page_images(lt_image_list, page_no)
+            if not status:
+                log('pymupdf 提取页面中图片 page_no: ' + str(page_no))
+                status = self.pymupdf_read_page_images(page_no)
+            if not status:
+                log('pymupdf 整页转化为图片 page_no: ' + str(page_no))
+                status = self.pymupdf_get_whole_page_image(page_no)
+
             if self.has_init_pdf[3] == 0:
             if self.has_init_pdf[3] == 0:
                 self.init_package("pdfplumber")
                 self.init_package("pdfplumber")
 
 
@@ -1059,7 +1187,24 @@ class PDFConvert:
             table_list = self.recognize_text(layout, page_no, lt_text_box_list, lt_line_list)
             table_list = self.recognize_text(layout, page_no, lt_text_box_list, lt_line_list)
 
 
             # 根据text规律,判断该页是否可能有无边框表格
             # 根据text规律,判断该页是否可能有无边框表格
+            try:
+                b_table_list, _ = get_b_table_by_blank_colon(lt_text_box_list, table_list, layout.bbox, None)
+            except:
+                traceback.print_exc()
+                b_table_list = []
+                self._page.error_code = [-23]
+
+            if b_table_list:
+                for table in b_table_list:
+                    _table = _Table(table[0], table[1])
+                    table_list += [_table]
+                    self._page.add_child(_table)
+
+            for t in table_list:
+                self._page.table_bbox_list.append(t.bbox)
+
             if self.judge_b_table(lt_text_box_list, table_list, page_no):
             if self.judge_b_table(lt_text_box_list, table_list, page_no):
+                # log('judge_b_table match! ' + str(page_no))
                 page_image = self.get_page_image(page_no)
                 page_image = self.get_page_image(page_no)
                 if judge_error_code(page_image):
                 if judge_error_code(page_image):
                     self._page.error_code = page_image
                     self._page.error_code = page_image
@@ -1073,6 +1218,7 @@ class PDFConvert:
                     _image.b_table_layout_size = (layout.width, layout.height)
                     _image.b_table_layout_size = (layout.width, layout.height)
                     self._page.add_child(_image)
                     self._page.add_child(_image)
 
 
+    @memory_decorator
     def get_layout(self, page, page_no):
     def get_layout(self, page, page_no):
         if self.has_init_pdf[0] == 0:
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
             self.init_package("pdfminer")
@@ -1096,6 +1242,7 @@ class PDFConvert:
         log("page_no: " + str(page_no) + " get_layout cost: " + str(time.time() - start_time))
         log("page_no: " + str(page_no) + " get_layout cost: " + str(time.time() - start_time))
         return layout
         return layout
 
 
+    @memory_decorator
     def get_page_image(self, page_no):
     def get_page_image(self, page_no):
         start_time = time.time()
         start_time = time.time()
         try:
         try:
@@ -1503,6 +1650,7 @@ class PDFConvert:
             return [-12]
             return [-12]
         return html
         return html
 
 
+    @memory_decorator
     def delete_water_mark(self, lt_text_list, page_bbox, times=5):
     def delete_water_mark(self, lt_text_list, page_bbox, times=5):
         # 删除过多重复字句,为水印
         # 删除过多重复字句,为水印
         duplicate_dict = {}
         duplicate_dict = {}
@@ -1540,6 +1688,32 @@ class PDFConvert:
                 temp_text_list.append(_obj)
                 temp_text_list.append(_obj)
         return temp_text_list, delete_text
         return temp_text_list, delete_text
 
 
+    @memory_decorator
+    def delete_water_mark_by_location(self, lt_text_box_list):
+        """Remove suspected watermark text boxes by position.
+
+        A watermark here is a single-character text box whose exact x-extent
+        and character repeat at 3 or more places on the page. Mutates
+        lt_text_box_list in place and returns it.
+        """
+        x_text_box_dict = {}
+        # Watermarks: same x coordinates and a text of length 1.
+        for lt_text_box in lt_text_box_list:
+            x1, y1, x2, y2 = lt_text_box.bbox
+            text = lt_text_box.get_text()
+            if len(text) != 1:
+                continue
+            # Group by exact x-range plus the character itself.
+            key = f'{x1}-{x2}-{text}'
+            if key in x_text_box_dict:
+                x_text_box_dict[key] += [lt_text_box]
+            else:
+                x_text_box_dict[key] = [lt_text_box]
+
+        len1 = len(lt_text_box_list)
+        for key, box_list in x_text_box_dict.items():
+            # 3+ identical hits at one x-range => treat the group as watermark.
+            if len(box_list) >= 3:
+                for box in box_list:
+                    if box in lt_text_box_list:
+                        lt_text_box_list.remove(box)
+        len2 = len(lt_text_box_list)
+        if len1 != len2:
+            log('delete_water_mark_by_location box num ' + str(len1) + ' -> ' + str(len2))
+        return lt_text_box_list
+
     def delete_water_mark_by_color(self, lt_text_list):
     def delete_water_mark_by_color(self, lt_text_list):
         # 删除浅色字体,大概率为水印
         # 删除浅色字体,大概率为水印
         # 1. 单个char颜色透明度0.8以上
         # 1. 单个char颜色透明度0.8以上
@@ -1587,6 +1761,9 @@ class PDFConvert:
         water_mark_text_box_list = []
         water_mark_text_box_list = []
         sin_range = [0.3, 0.94]
         sin_range = [0.3, 0.94]
         for lt_text_box in lt_text_list:
         for lt_text_box in lt_text_list:
+            if '.......' in lt_text_box.get_text():
+                # print('....... lt_text_box continue')
+                continue
             for lt_text_line in lt_text_box:
             for lt_text_line in lt_text_box:
                 for lt_char in lt_text_line:
                 for lt_char in lt_text_line:
                     matrix = lt_char.matrix
                     matrix = lt_char.matrix
@@ -1634,6 +1811,126 @@ class PDFConvert:
             log("page_no: " + str(page_no) + " get_single_pdf error!")
             log("page_no: " + str(page_no) + " get_single_pdf error!")
             return [-3]
             return [-3]
 
 
+    def pymupdf_read_page_images(self, page_no):
+        try:
+            self.init_package("PyMuPDF")
+            # 获取指定页面
+            page = self.doc_pymupdf.load_page(page_no)
+            # 获取页面中所有图片的信息
+            image_list = page.get_images(full=True)
+
+            # 存储提取的图片信息
+            extracted_images = []
+
+            # 遍历图片列表
+            for img_index, img_info in enumerate(image_list):
+                xref = img_info[0]  # 图片xref编号
+                base_image = self.doc_pymupdf.extract_image(xref)
+                image_bytes = base_image["image"]  # 图片字节数据
+                image_ext = base_image["ext"]  # 图片扩展名
+
+                # 获取图片在页面中的位置和大小
+                bbox = img_info[0:4]  # x0, y0, x1, y1
+                # print('img_info', img_info)
+                width = img_info[2] - img_info[0]  # 计算宽度
+                height = img_info[3] - img_info[1]  # 计算高度
+
+                # 构建图片信息字典
+                img_data = {
+                    "xref": xref,
+                    "width": width,
+                    "height": height,
+                    "image": image_bytes,
+                    "ext": image_ext,
+                    "bbox": bbox
+                }
+                extracted_images.append(img_data)
+
+            image_obj_list = []
+            for index, d in enumerate(extracted_images):
+                temp_path = self.unique_type_dir + 'page' + str(page_no) \
+                            + '_lt2_' + str(index) + '.jpg'
+                image_bytes = d.get("image")
+                bbox = d.get('bbox')
+                with open(temp_path, 'wb') as f:
+                    f.write(image_bytes)
+
+                _image = _Image(image_bytes, temp_path, bbox)
+                image_md5 = get_md5_from_bytes(image_bytes)
+                image_obj_list.append([_image, image_md5])
+        except:
+            traceback.print_exc()
+            return False
+
+        for _image, image_md5 in image_obj_list:
+            self._page.add_child(_image)
+            self.md5_image_obj_list.append([image_md5, _image])
+        return True
+
+    def pymupdf_get_whole_page_image(self, page_no):
+        image_obj_list = []
+        page_image = self.get_page_image(page_no)
+        if judge_error_code(page_image):
+            self._page.error_code = page_image
+            return False
+        else:
+            _image = _Image(page_image[1], page_image[0])
+            _image.is_from_pdf = True
+            _image.is_reverse = False
+            image_md5 = get_md5_from_bytes(page_image[1])
+            image_obj_list.append([_image, image_md5])
+
+        for _image, image_md5 in image_obj_list:
+            self._page.add_child(_image)
+            self.md5_image_obj_list.append([image_md5, _image])
+        return True
+
+    def pdfminer_read_page_images(self, lt_image_list, page_no):
+        # 图表对象
+        image_obj_list = []
+        for image in lt_image_list:
+            try:
+                # print("pdf2text LTImage size", page_no, image.width, image.height)
+                # image_stream = image.stream.get_data()
+                # print('image.stream.get_filters()', image.stream.get_filters())
+                image_stream = image.stream.get_data()
+                # 小的图忽略
+                if image.width <= 300 and image.height <= 300:
+                    continue
+                # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
+                img_test = Image.open(io.BytesIO(image_stream))
+                # img_test = self.pdfminer_stream_to_image(image)
+                # if image.height >= 1000 and image.width >= 1000:
+                #     page_image = self.get_page_image(page_no)
+                #     if judge_error_code(page_image):
+                #         self._page.error_code = page_image
+                #     else:
+                #         _image = _Image(page_image[1], page_image[0])
+                #         _image.is_from_pdf = True
+                #         _image.is_reverse = False
+                #         image_md5 = get_md5_from_bytes(page_image[1])
+                #         image_obj_list.append([_image, image_md5])
+                # # 比较小的图则直接保存用ocr识别
+                # else:
+                temp_path = self.unique_type_dir + 'page' + str(page_no) \
+                            + '_lt_' + str(lt_image_list.index(image)) + '.jpg'
+                img_test.save(temp_path)
+                with open(temp_path, "rb") as ff:
+                    image_stream = ff.read()
+                _image = _Image(image_stream, temp_path, image.bbox)
+                self._page.add_child(_image)
+                image_md5 = get_md5_from_bytes(image_stream)
+                self.md5_image_obj_list.append([image_md5, _image])
+            except Exception:
+                log("page_no: " + str(page_no) + " pdfminer read image fail!")
+                traceback.print_exc()
+                return False
+
+        for _image, image_md5 in image_obj_list:
+            self._page.add_child(_image)
+            self.md5_image_obj_list.append([image_md5, _image])
+        return True
+
 
 
 def get_text_font():
 def get_text_font():
     def flags_decomposer(flags):
     def flags_decomposer(flags):
@@ -1999,4 +2296,8 @@ class ParseUtils:
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    PDFConvert(r"C:/Users/Administrator/Downloads/1651896704621.pdf", "C:/Users/Administrator/Downloads/1").get_html()
+    _pp = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf/e-116.pdf'
+    # _pp = r'C:\Users\Administrator\Downloads\1746582280828.pdf'
+    _html = PDFConvert(_pp, r"D:\Project\format_conversion_maxcompute\format_convert\temp", None).get_html()
+    with open('../result.html', 'w', encoding='utf-8') as f:
+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])

+ 30 - 36
format_convert/convert_test.py

@@ -11,15 +11,6 @@ from glob import glob
 import requests
 import requests
 
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTLine
-from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfparser import PDFParser
-from pdfplumber import PDF
-
-from otr.table_line_pdf import _plot
 
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
@@ -44,7 +35,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
             'timeout': timeout, 'save_middle': save_middle}
             'timeout': timeout, 'save_middle': save_middle}
 
 
-    # _url = 'http://121.46.18.113:15010/convert'
+    # _url = 'http://dianxin.bidizhaobiao.com:15010/convert'
     # _url = 'http://192.168.2.103:15010/convert'
     # _url = 'http://192.168.2.103:15010/convert'
     # _url = 'http://192.168.2.102:15010/convert'
     # _url = 'http://192.168.2.102:15010/convert'
     # _url = 'http://172.16.160.65:15010/convert'
     # _url = 'http://172.16.160.65:15010/convert'
@@ -53,7 +44,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
     text_str = ""
     text_str = ""
     try:
     try:
         result = json.loads(request_post(_url, data, time_out=timeout+20))
         result = json.loads(request_post(_url, data, time_out=timeout+20))
-
+        print('result', result)
         for t in result.get("result_html"):
         for t in result.get("result_html"):
             text_str += t
             text_str += t
         to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
         to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
@@ -67,7 +58,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
                 to_html(new_path, text_str)
                 to_html(new_path, text_str)
 
 
         print(_md5)
         print(_md5)
-        print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
+        # print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
         print("result_text", result.get("result_text")[0][:20])
         print("result_text", result.get("result_text")[0][:20])
         print("is_success", result.get("is_success"))
         print("is_success", result.get("is_success"))
     except:
     except:
@@ -80,7 +71,6 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
     return p, 1
     return p, 1
 
 
 
 
-
 def test_path():
 def test_path():
     # _url = 'http://121.46.18.113:15010/convert'
     # _url = 'http://121.46.18.113:15010/convert'
     _url = 'http://192.168.0.115:15010/convert'
     _url = 'http://192.168.0.115:15010/convert'
@@ -186,21 +176,25 @@ def test_kimi():
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
     if get_platform() == "Windows":
     if get_platform() == "Windows":
-        # file_path = "C:/Users/Administrator/Downloads/1672314827836.pdf"
+        # file_path = "C:/Users/Administrator/Downloads/1750737587843.ofd"
+        # file_path = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf/e-1.pdf'
         # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
         # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
 
 
-        # file_path = "C:/Users/Administrator/Desktop/test_xls/error7.xls"
-        # file_path = "C:/Users/Administrator/Desktop/test_doc/error15.doc"
-        # file_path = "C:/Users/Administrator/Desktop/test_swf/error1.swf"
+        # file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xlsx"
+        # file_path = "C:/Users/Administrator/Desktop/test_doc/error17.docx"
+        # file_path = "C:/Users/Administrator/Desktop/test_swf/error2.swf"
         # file_path = "C:/Users/Administrator/Desktop/test_rar/error1.rar"
         # file_path = "C:/Users/Administrator/Desktop/test_rar/error1.rar"
-        file_path = "C:/Users/Administrator/Desktop/test_image/error7.png"
-        # file_path = "C:/Users/Administrator/Desktop/test_b_table/error13.pdf"
-        # file_path = "C:/Users/Administrator/Desktop/test_pdf/表格连接error/error6.pdf"
+        # file_path = "C:/Users/Administrator/Desktop/test_image/error18.png"
+        # file_path = "C:/Users/Administrator/Desktop/test_b_table/error29.png"
+        # file_path = "C:/Users/Administrator/Desktop/test_pdf/普通error/error6.pdf"
         # file_path = "C:/Users/Administrator/Desktop/test_table_head/error2.pdf"
         # file_path = "C:/Users/Administrator/Desktop/test_table_head/error2.pdf"
+        # file_path = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
+        file_path = "C:/Users/Administrator/Desktop/test_ofd/1750381792388.ofd"
     else:
     else:
         file_path = "1660296734009.pdf"
         file_path = "1660296734009.pdf"
 
 
-    test_one(file_path, page_no_range='1,-1', timeout=1000, save_middle=None)
+    # test_one(file_path, page_no_range="1,-1", timeout=1000, save_middle=None)
+    test_one(file_path, page_no_range=None, timeout=1000, save_middle=None)
 
 
     # run_files()
     # run_files()
 
 
@@ -212,21 +206,21 @@ if __name__ == '__main__':
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
     # file_path = r"C:\Users\Administrator\Desktop\test_b_table/"
     # file_path = r"C:\Users\Administrator\Desktop\test_b_table/"
-    file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
-    test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
-                     ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
-                     ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
-                     ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
-                     ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
-                     ['error50.pdf', '1,-1'],
-                     ['error59.pdf', '1,-1'],
-                     ['error60.pdf', '1,-1'],
-                     ['error61.pdf', '1,-1'],
-                     ['error7.pdf', '39,57'],
-                     ['error8.pdf', '7,12'],
-                     ['error23.pdf', '1,-1']
-                     ]
-    index = 11
+    # file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
+    # test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
+    #                  ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
+    #                  ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
+    #                  ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
+    #                  ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
+    #                  ['error50.pdf', '1,-1'],
+    #                  ['error59.pdf', '1,-1'],
+    #                  ['error60.pdf', '1,-1'],
+    #                  ['error61.pdf', '1,-1'],
+    #                  ['error7.pdf', '39,57'],
+    #                  ['error8.pdf', '7,12'],
+    #                  ['error23.pdf', '1,-1']
+    #                  ]
+    # index = 11
     # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
     # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
 
 
 
 

+ 91 - 4
format_convert/convert_tree.py

@@ -61,6 +61,8 @@ class _Page:
         self.in_table_objs = set()
         self.in_table_objs = set()
         # 是否pdf
         # 是否pdf
         self.is_pdf = 0
         self.is_pdf = 0
+        # 所有表格范围
+        self.table_bbox_list = []
 
 
     def add_child(self, child):
     def add_child(self, child):
         if child.error_code is None:
         if child.error_code is None:
@@ -74,12 +76,66 @@ class _Page:
 
 
         self.children = sort_object(self.children, self.is_reverse)
         self.children = sort_object(self.children, self.is_reverse)
 
 
+        # 有图片类型,需返回图片中所有对象,并重新设置图片中的bbox,以及图片后的对象的bbox
+        image_add_y = 0
+        add_childern = []
+        for child in self.children:
+            if type(child) == _Image:
+                image_children = child.get_html(return_children=True)
+                if judge_error_code(image_children) and not self.is_pdf:
+                    self.error_code = image_children
+                    return self.error_code
+                if len(image_children) == 0:
+                    continue
+                image_children = sort_object(image_children, False)
+
+                # 单张图可能无bbox,但文档中的图有bbox
+                if child.bbox != (0, 0, 0, 0):
+                    for i_child in image_children:
+                        i_child.bbox = [i_child.bbox[0], i_child.bbox[1] + child.bbox[3] + image_add_y,
+                                        i_child.bbox[2], i_child.bbox[3] + child.bbox[3] + image_add_y
+                                        ]
+
+                image_add_y += image_children[-1].bbox[3]
+                add_childern += image_children
+                continue
+
+            # 图片对象后面的对象,bbox重新设置
+            child.bbox = [child.bbox[0], child.bbox[1] + image_add_y,
+                          child.bbox[2], child.bbox[3] + image_add_y
+                          ]
+            # self.children += child.get_html(return_children=True)
+
+        self.children += add_childern
+        self.children = sort_object(self.children, self.is_reverse)
+
+        # 获取所有table,计算bbox,排除在table中的sentence
+        for child in self.children:
+            if type(child) == _Table:
+                # table_bbox = get_table_bbox(child.content)
+                # print('table.content ', child.content)
+                # print('child.bbox', child.bbox)
+                self.table_bbox_list += [child.bbox]
+
         html_text = ""
         html_text = ""
         image_html = ""
         image_html = ""
         text_html = ""
         text_html = ""
         for child in self.children:
         for child in self.children:
+            if type(child) == _Image:
+                continue
+            if type(child) == _Sentence:
+                continue_flag = 0
+                for table_bbox in self.table_bbox_list:
+                    # print('table_bbox', table_bbox)
+                    if table_bbox[1] - 3 <= child.bbox[1] <= child.bbox[3] <= table_bbox[3] + 3:
+                        continue_flag = 1
+                        break
+                if continue_flag:
+                    continue
+
             # 先调用get_html才能更新error_code
             # 先调用get_html才能更新error_code
             child_html_text = child.get_html()
             child_html_text = child.get_html()
+            # print('sort child_html_text', child_html_text)
             if child.error_code is not None:
             if child.error_code is not None:
                 self.error_code = child.error_code
                 self.error_code = child.error_code
                 return ""
                 return ""
@@ -158,14 +214,16 @@ class _Image:
         else:
         else:
             self.error_code = child.error_code
             self.error_code = child.error_code
 
 
-    def get_html(self):
+    def get_html(self, return_children=False):
         # 将Image转为Sentence,table
         # 将Image转为Sentence,table
         self.convert()
         self.convert()
         # if self.error_code == [-16]:
         # if self.error_code == [-16]:
         #     self.error_code = None
         #     self.error_code = None
         #     return "<div>#idc error#<div>"
         #     return "<div>#idc error#<div>"
         if self.error_code is not None:
         if self.error_code is not None:
-            return ""
+            return self.error_code
+        if return_children:
+            return self.children
 
 
         html_text = ""
         html_text = ""
         self.children = sort_object(self.children)
         self.children = sort_object(self.children)
@@ -192,7 +250,9 @@ class _Image:
                                  self.b_table_layout_size, self.is_reverse)
                                  self.b_table_layout_size, self.is_reverse)
         if judge_error_code(obj_list):
         if judge_error_code(obj_list):
             # 20241101 注释 图片识别报错返回空
             # 20241101 注释 图片识别报错返回空
-            # self.error_code = obj_list
+            # 20250604 不是来源pdf的,返回错误码
+            if not self.is_from_pdf:
+                self.error_code = obj_list
             return
             return
 
 
         if self.b_table_from_text:
         if self.b_table_from_text:
@@ -213,9 +273,19 @@ class _Table:
         self.bbox = bbox
         self.bbox = bbox
         self.x = bbox[0]
         self.x = bbox[0]
         self.y = bbox[1]
         self.y = bbox[1]
-        self.shape = (len(content), len(content[0]))
+        if len(content) and len(content[0]):
+            self.shape = (len(content), len(content[0]))
+        else:
+            self.shape = (0, 0)
         self.error_code = None
         self.error_code = None
 
 
+    def get_table_bbox(self, table):
+        x1 = min([y.bbox[0] for x in table for y in x])
+        y1 = min([y.bbox[1] for x in table for y in x])
+        x2 = max([y.bbox[2] for x in table for y in x])
+        y2 = max([y.bbox[3] for x in table for y in x])
+        return [x1, y1, x2, y2]
+
     def get_html(self):
     def get_html(self):
         if self.error_code is not None:
         if self.error_code is not None:
             return ""
             return ""
@@ -227,6 +297,9 @@ class _Table:
             html_text = get_table_html(self.content)
             html_text = get_table_html(self.content)
             return html_text
             return html_text
 
 
+    def __repr__(self):
+        return '(%s@#@%s)' % (str('table'), '@'.join([str(x) for x in self.bbox]))
+
 
 
 class _Sentence:
 class _Sentence:
     def __init__(self, content, bbox, is_html=False):
     def __init__(self, content, bbox, is_html=False):
@@ -249,6 +322,9 @@ class _Sentence:
         else:
         else:
             return add_div(self.content)
             return add_div(self.content)
 
 
+    def __repr__(self):
+        return '(%s@#@%s)' % (str(self.content), '@'.join([str(x) for x in self.bbox]))
+
 
 
 class TextBox:
 class TextBox:
     def __init__(self, bbox, text):
     def __init__(self, bbox, text):
@@ -261,6 +337,17 @@ class TextBox:
     def __str__(self):
     def __str__(self):
         return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
         return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
 
 
+    def __repr__(self):
+        return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
+
+    def __hash__(self):
+        return hash(self.__str__())
+
+    def __eq__(self, other):
+        if isinstance(other, TextBox):
+            return self.__str__() == other.__str__()
+        return False
+
 
 
 class TableLine:
 class TableLine:
     def __init__(self, bbox):
     def __init__(self, bbox):

+ 61 - 0
format_convert/convert_wps.py

@@ -0,0 +1,61 @@
+import os
+import re
+import sys
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
+from format_convert.convert_tree import _Document, _Sentence, _Page
+import logging
+import traceback
+from format_convert.convert_doc import DocConvert
+from format_convert.utils import judge_error_code, get_logger, log
+
+
class WpsConvert:
    """Convert a .wps file by copying it under a .doc suffix and delegating
    the actual conversion to DocConvert.

    WPS files are handled by the same pipeline as legacy MS Word .doc files,
    so no format-specific parsing is done here.
    """

    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir

    def convert(self):
        """Copy the wps bytes to a .doc file and run the doc converter."""
        # 改后缀,调用doc处理
        file_name = re.split('[/\\\]', self.path)[-1]
        with open(self.path, 'rb') as file:
            content = file.read()

        # splitext is robust to names whose extension is not exactly
        # 4 characters long (the original sliced file_name[:-4]).
        new_file_name = os.path.splitext(file_name)[0] + '.doc'
        new_file_path = self.unique_type_dir + new_file_name
        with open(new_file_path, 'wb') as file:
            file.write(content)

        # Use the project logger instead of debug print() calls.
        log('wps file ' + file_name + ' -> ' + new_file_name)

        self._doc_convert = DocConvert(new_file_path, self.unique_type_dir)
        self._doc_convert.convert()
        self._doc = self._doc_convert._doc

    def get_html(self):
        """Run the conversion; return the doc html list or an error code list."""
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]

        # 直接返回doc处理的html
        if self._doc.error_code is not None:
            return self._doc.error_code
        else:
            return self._doc.get_html()
+
+
if __name__ == '__main__':
    # Ad-hoc manual test: convert one wps file and dump the html result.
    wps_path = "C:/Users/Administrator/Downloads/1723004790329.wps"
    # wps_path = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
    output_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
    converted = WpsConvert(wps_path, output_dir).get_html()
    with open('../result.html', 'w', encoding='utf-8') as f:
        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + converted[0])
+
+

+ 6 - 0
format_convert/easyofd/easyofd/__init__.py

@@ -0,0 +1,6 @@
+from .ofd import OFD
+__version__ = "0.5.1"
+__author__ = "renoyuan"
+__email__ = "renoyuan@foxmail.com"
+__description__ = "一个用于OFD文档处理的Python库"
+__all__ = ["OFD"]

+ 474 - 0
format_convert/easyofd/easyofd/chinese_characters.txt

@@ -0,0 +1,474 @@
+豈
+更
+車
+賈
+滑
+串
+句
+龜
+龜
+契
+金
+喇
+奈
+懶
+癩
+羅
+蘿
+螺
+裸
+邏
+樂
+洛
+烙
+珞
+落
+酪
+駱
+亂
+卵
+欄
+爛
+蘭
+鸞
+嵐
+濫
+藍
+襤
+拉
+臘
+蠟
+廊
+朗
+浪
+狼
+郎
+來
+冷
+勞
+擄
+櫓
+爐
+盧
+老
+蘆
+虜
+路
+露
+魯
+鷺
+碌
+祿
+綠
+菉
+錄
+鹿
+論
+壟
+弄
+籠
+聾
+牢
+磊
+賂
+雷
+壘
+屢
+樓
+淚
+漏
+累
+縷
+陋
+勒
+肋
+凜
+凌
+稜
+綾
+菱
+陵
+讀
+拏
+樂
+諾
+丹
+寧
+怒
+率
+異
+北
+磻
+便
+復
+不
+泌
+數
+索
+參
+塞
+省
+葉
+說
+殺
+辰
+沈
+拾
+若
+掠
+略
+亮
+兩
+凉
+梁
+糧
+良
+諒
+量
+勵
+呂
+女
+廬
+旅
+濾
+礪
+閭
+驪
+麗
+黎
+力
+曆
+歷
+轢
+年
+憐
+戀
+撚
+漣
+煉
+璉
+秊
+練
+聯
+輦
+蓮
+連
+鍊
+列
+劣
+咽
+烈
+裂
+說
+廉
+念
+捻
+殮
+簾
+獵
+令
+囹
+寧
+嶺
+怜
+玲
+瑩
+羚
+聆
+鈴
+零
+靈
+領
+例
+禮
+醴
+隸
+惡
+了
+僚
+寮
+尿
+料
+樂
+燎
+療
+蓼
+遼
+龍
+暈
+阮
+劉
+杻
+柳
+流
+溜
+琉
+留
+硫
+紐
+類
+六
+戮
+陸
+倫
+崙
+淪
+輪
+律
+慄
+栗
+率
+隆
+利
+吏
+履
+易
+李
+梨
+泥
+理
+痢
+罹
+裏
+裡
+里
+離
+匿
+溺
+吝
+燐
+璘
+藺
+隣
+鱗
+麟
+林
+淋
+臨
+立
+笠
+粒
+狀
+炙
+識
+什
+茶
+刺
+切
+度
+拓
+糖
+宅
+洞
+暴
+輻
+行
+降
+見
+廓
+兀
+嗀
+﨎
+﨏
+塚
+﨑
+晴
+﨓
+﨔
+凞
+猪
+益
+礼
+神
+祥
+福
+靖
+精
+羽
+﨟
+蘒
+﨡
+諸
+﨣
+﨤
+逸
+都
+﨧
+﨨
+﨩
+飯
+飼
+館
+鶴
+郞
+隷
+侮
+僧
+免
+勉
+勤
+卑
+喝
+嘆
+器
+塀
+墨
+層
+屮
+悔
+慨
+憎
+懲
+敏
+既
+暑
+梅
+海
+渚
+漢
+煮
+爫
+琢
+碑
+社
+祉
+祈
+祐
+祖
+祝
+禍
+禎
+穀
+突
+節
+練
+縉
+繁
+署
+者
+臭
+艹
+艹
+著
+褐
+視
+謁
+謹
+賓
+贈
+辶
+逸
+難
+響
+頻
+恵
+𤋮
+舘
+﩮
+﩯
+並
+况
+全
+侀
+充
+冀
+勇
+勺
+喝
+啕
+喙
+嗢
+塚
+墳
+奄
+奔
+婢
+嬨
+廒
+廙
+彩
+徭
+惘
+慎
+愈
+憎
+慠
+懲
+戴
+揄
+搜
+摒
+敖
+晴
+朗
+望
+杖
+歹
+殺
+流
+滛
+滋
+漢
+瀞
+煮
+瞧
+爵
+犯
+猪
+瑱
+甆
+画
+瘝
+瘟
+益
+盛
+直
+睊
+着
+磌
+窱
+節
+类
+絛
+練
+缾
+者
+荒
+華
+蝹
+襁
+覆
+視
+調
+諸
+請
+謁
+諾
+諭
+謹
+變
+贈
+輸
+遲
+醙
+鉶
+陼
+難
+靖
+韛
+響
+頋
+頻
+鬒
+龜
+𢡊
+𢡄
+𣏕
+㮝
+䀘
+䀹
+𥉉
+𥳐
+𧻓
+齃
+龎

+ 23 - 0
format_convert/easyofd/easyofd/draw/__init__.py

@@ -0,0 +1,23 @@
+import os
+import sys
+
+from reportlab.pdfbase import pdfmetrics
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../../")
+from format_convert.easyofd.easyofd.parser_ofd import *
+
+FONTS = ['宋体',"SWPMEH+SimSun",'SimSun','KaiTi','楷体',"STKAITI","SWLCQE+KaiTi",
+         'Courier New','STSong-Light',"CourierNew","SWANVV+CourierNewPSMT",
+         "CourierNewPSMT","BWSimKai","hei","黑体","SimHei","SWDKON+SimSun",
+         "SWCRMF+CourierNewPSMT","SWHGME+KaiTi"]
+
+from .font_tools import FontTool
+from .draw_pdf import DrawPDF
+from .draw_ofd import OFDWrite
+
+
+
+
+
+
+    

+ 290 - 0
format_convert/easyofd/easyofd/draw/draw_ofd.py

@@ -0,0 +1,290 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: F:\code\easyofd\easyofd\draw
+# CREATE_TIME: 2023-10-26
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# note:  写入 xml 目录并打包成ofd 文件
+from datetime import datetime
+from io import BytesIO
+from typing import Optional
+
+from PIL import Image
+from loguru import logger
+
+from .ofdtemplate import CurId, OFDTemplate, DocumentTemplate, DocumentResTemplate, PublicResTemplate, ContentTemplate, \
+    OFDStructure
+from .pdf_parse import DPFParser
+
+
+class OFDWrite(object):
+    """
+    写入ofd 工具类
+    """
+
    def __init__(self, ):
        # Scale factor used to divide pdf coordinates into OFD page units:
        # 200 / 25.4 looks like pixels-per-millimetre at 200 DPI
        # (OFD physical boxes are presumably in mm) — TODO confirm.
        self.OP = 200 / 25.4
        # self.OP = 1
+
+    def build_ofd_entrance(self, id_obj: Optional[CurId] = None):
+        """
+        build_ofd_entrance
+        """
+        CreationDate = str(datetime.now())
+        ofd_entrance = OFDTemplate(CreationDate=CreationDate, id_obj=id_obj)
+        return ofd_entrance
+
+    def build_document(self, img_len, id_obj: Optional[CurId] = None, PhysicalBox: Optional[str] = "0 0 140 90"):
+        """
+        build_document
+        """
+        pages = []
+
+        for idx in range(img_len):
+            pages.append(
+                {
+                    "@ID": f"{idx + 1}",
+                    "@BaseLoc": f"Pages/Page_{idx}/Content.xml"
+                }
+            )
+        document = DocumentTemplate(Page=pages, id_obj=id_obj, PhysicalBox=PhysicalBox)
+        return document
+
+    def build_document_res(self, img_len: int = 0, id_obj: Optional[CurId] = None,
+                           pfd_res_uuid_map: Optional[dict] = None):
+        """
+        build_document_res
+        """
+        MultiMedia = []
+        DrawParams = []  # todo DrawParams 参数后面有空增加
+        pfd_img = None
+        if pfd_res_uuid_map:
+            pfd_img = pfd_res_uuid_map.get("img")
+
+        if img_len and not pfd_res_uuid_map:
+            for num in range(img_len):
+                MultiMedia.append({
+                    "@ID": 0,
+                    "@Type": "Image",
+                    "ofd:MediaFile": f"Image_{num}.jpg",
+                    "res_uuid": f"{num}",
+                })
+        elif pfd_res_uuid_map and pfd_img:
+            for res_uuid in pfd_img.keys():
+                name = f"Image_{res_uuid}.jpg"
+                MultiMedia.append({
+                    "@ID": 0,
+                    "@Type": "Image",
+                    "ofd:MediaFile": name,
+                    "res_uuid": res_uuid,
+
+                })
+
+        document_res = DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
+        return document_res
+
+    def build_public_res(self, id_obj: CurId = None, pfd_res_uuid_map: dict = None):
+        """
+        build_public_res
+        """
+        fonts = []
+
+        pfd_font = None
+        if pfd_res_uuid_map:
+            pfd_font = pfd_res_uuid_map.get("font")
+
+        if pfd_res_uuid_map and pfd_font:
+            for res_uuid, font in pfd_font.items():
+                fonts.append({
+                    "@ID": 0,
+                    "@FontName": font,
+                    "@FamilyName": font,  # 匹配替代字型
+                    "res_uuid": res_uuid,
+                    "@FixedWidth": "false",
+                    "@Serif": "false",
+                    "@Bold": "false",
+                    "@Charset": "prc"
+                })
+        else:
+            pass
+
+        public_res = PublicResTemplate(Font=fonts, id_obj=id_obj)
+        return public_res
+
    def build_content_res(self, pil_img_list=None, pdf_info_list=None, id_obj: CurId = None,
                          pfd_res_uuid_map: dict = None):
        """Build one ContentTemplate (Content.xml) per page.

        pil_img_list: one image per page -> each page just displays an image.
        pdf_info_list: parsed pdf blocks -> pages rebuilt from text/image blocks.
        Coordinates from the pdf are divided by self.OP to convert into OFD
        page units.
        """
        PhysicalBox = None
        content_res_list = []
        if pil_img_list:
            # Image-only pages: stretch the image over the whole physical box.
            for idx, pil_img in enumerate(pil_img_list):
                # print(pil_img)
                # print(idx, pil_img[1], pil_img[2])
                # pil_img is (image, width, height) — widths/heights feed both
                # the page box and the image CTM.
                PhysicalBox = f"0 0 {pil_img[1]} {pil_img[2]}"
                ImageObject = [{
                    "@ID": 0,
                    "@CTM": f"{pil_img[1]} 0 0 {pil_img[2]} 0 0",
                    "@Boundary": f"0 0 {pil_img[1]} {pil_img[2]}",
                    "res_uuid": f"{idx}",  # resource identifier
                    "@ResourceID": f""
                }]

                conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,

                                         CGTransform=[], PathObject=[], TextObject=[], id_obj=id_obj)
                # print(conten)
                content_res_list.append(conten)
        elif pdf_info_list:  # write parsed pdf results  # todo: image ids must be pre-defined or aligned back some other way

            for idx, content in enumerate(pdf_info_list):
                ImageObject = []
                TextObject = []
                PhysicalBox = pfd_res_uuid_map["other"]["page_size"][idx]
                PhysicalBox = f"0 0 {PhysicalBox[0]} {PhysicalBox[1]}"  # if page_size is missing, the one in document is used
                for block in content:
                    # print(block)

                    # Convert pdf points to OFD units: origin plus width/height.
                    bbox = block['bbox']
                    x0, y0, length, height = bbox[0] / self.OP, bbox[1] / self.OP, (bbox[2] - bbox[0]) / self.OP, (
                            bbox[3] - bbox[1]) / self.OP
                    if block["type"] == "text":

                        count = len(block.get("text"))

                        # DeltaX spreads the glyphs evenly over the boundary width.
                        TextObject.append({
                            "@ID": 0,
                            "res_uuid": block.get("res_uuid"),  # resource identifier
                            "@Font": "",
                            "ofd:FillColor": {"Value": "156 82 35"},

                            "ofd:TextCode": {
                                "#text": block.get("text"),
                                "@X": "0",
                                "@Y": f"{block.get('size') / self.OP}",
                                "@DeltaX": f"g {count - 1} {length / count}"
                            },

                            "@size": block.get("size") / self.OP,
                            "@Boundary": f"{x0} {y0} {length} {height}",

                        })
                    elif block["type"] == "img":
                        ImageObject.append({
                            "@ID": 0,
                            "res_uuid": block.get("res_uuid"),  # resource identifier

                            "@Boundary": f"{x0} {y0} {length} {height}",
                            "@ResourceID": f""  # must be linked to the matching public res entry

                        })

                # for i in content:
                #     if i["type"] == "img":
                #         ImageObject.append(i)
                #     elif i["type"] == "text":
                #         TextObject.append(i)

                conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,

                                         CGTransform=[], PathObject=[], TextObject=TextObject, id_obj=id_obj)
                # print(conten)
                content_res_list.append(conten)
        else:
            pass
        return content_res_list
+
+    def pil_2_bytes(self, image):
+        """"""
+        # 创建一个 BytesIO 对象
+        img_bytesio = BytesIO()
+
+        # 将图像保存到 BytesIO 对象
+        image.save(img_bytesio, format='PNG')  # 你可以根据需要选择其他图像格式
+
+        # 获取 BytesIO 对象中的字节
+        img_bytes = img_bytesio.getvalue()
+
+        # 关闭 BytesIO 对象
+        img_bytesio.close()
+        return img_bytes
+
    def __call__(self, pdf_bytes=None, pil_img_list=None, optional_text=False):
        """Convert a PDF (bytes) or a list of PIL images into OFD bytes.

        Input is either pdf_bytes or pil_img_list; when pdf_bytes is given,
        optional_text selects between an editable OFD (text extracted from
        the PDF) and a page-image OFD.

        Steps:
        0. parse the pdf file
        1. build the required ofd templates
        2. assemble them into an ofd byte stream

        NOTE(review): if both inputs are omitted, to_img(None) is reached —
        callers presumably always supply one input; confirm.
        """
        pdf_obj = DPFParser()
        page_pil_img_list = None

        # build the per-page image list for an image-based OFD
        if pil_img_list:  # images supplied directly
            page_pil_img_list = [(self.pil_2_bytes(_img), _img.size[0] / self.OP, _img.size[1] / self.OP) for _img in
                                 pil_img_list]
        else:  # read the pdf, optionally rendering pages to images
            if optional_text:  # produce an editable ofd
                pdf_info_list, pfd_res_uuid_map = pdf_obj.extract_text_with_details(pdf_bytes)  # parse the pdf
            else:
                img_list = pdf_obj.to_img(pdf_bytes)
                page_pil_img_list = [(self.pil_2_bytes(Image.frombytes("RGB", [_img.width, _img.height],
                                                                       _img.samples)), _img.width / self.OP,
                                      _img.height / self.OP) for _img in img_list]

        id_obj = CurId()

        if page_pil_img_list:  # image content -> ofd
            res_static = {}  # image resources
            pfd_res_uuid_map = {"img": {}}
            # every page shares the first page's physical box here
            PhysicalBox = f"0 0 {page_pil_img_list[0][1]} {page_pil_img_list[0][2]}"
            for idx, pil_img_tuple in enumerate(page_pil_img_list):
                pfd_res_uuid_map["img"][f"{idx}"] = pil_img_tuple[0]
                res_static[f"Image_{idx}.jpg"] = pil_img_tuple[0]
            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
            document = self.build_document(len(page_pil_img_list), id_obj=id_obj, PhysicalBox=PhysicalBox)
            public_res = self.build_public_res(id_obj=id_obj)
            document_res = self.build_document_res(len(page_pil_img_list), id_obj=id_obj,
                                                   pfd_res_uuid_map=pfd_res_uuid_map)

            content_res_list = self.build_content_res(page_pil_img_list, id_obj=id_obj,
                                                      pfd_res_uuid_map=pfd_res_uuid_map)


        else:
            # every generated document-structure object shares one id generator
            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
            document = self.build_document(len(pdf_info_list), id_obj=id_obj)
            public_res = self.build_public_res(id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
            document_res = self.build_document_res(len(pdf_info_list), id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
            content_res_list = self.build_content_res(pdf_info_list=pdf_info_list, id_obj=id_obj,
                                                      pfd_res_uuid_map=pfd_res_uuid_map)

            res_static = {}  # image resources

            print("pfd_res_uuid_map", pfd_res_uuid_map)
            img_dict = pfd_res_uuid_map.get("img")
            if img_dict:
                for key, v_io in img_dict.items():
                    res_static[f"Image_{key}.jpg"] = v_io.getvalue()

        # assemble the ofd file
        ofd_byte = OFDStructure("123", ofd=ofd_entrance, document=document, public_res=public_res,
                                document_res=document_res, content_res=content_res_list, res_static=res_static)(
            test=True)
        return ofd_byte
+
+
if __name__ == "__main__":
    # Ad-hoc smoke test: read a local file and convert it to OFD bytes.
    source_path = r"D:\renodoc\技术栈\GBT_33190-2016_电子文件存储与交换格式版式文档.pdf"
    source_path = r"F:\code\easyofd\test"
    with open(source_path, "rb") as src:
        raw_bytes = src.read()

    converted = OFDWrite()(raw_bytes)

    with open("ofd.ofd", "wb") as dst:
        dst.write(converted)

+ 1178 - 0
format_convert/easyofd/easyofd/draw/draw_pdf.py

@@ -0,0 +1,1178 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: E:\code\easyofd\easyofd\draw
+# CREATE_TIME: 2023-08-10
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:  绘制pdf
+import base64
+import math
+import os
+import re
+import sys
+import traceback
+from io import BytesIO
+
+from PIL import Image as PILImage, Image, ImageFont, ImageDraw
+from fontTools.ttLib import TTFont
+from loguru import logger
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.utils import ImageReader
+from reportlab.pdfgen import canvas
+
+from format_convert.utils import special_font_to_normal, image_resize_by_ratio
+
+sys.path.append(os.path.dirname(__file__) + "/../../../../")
+from format_convert.easyofd.easyofd.draw.font_tools import FontTool
+from .find_seal_img import SealExtract
+
+
+# print(reportlab_fonts)
+class DrawPDF():
+    """
+    ofd 解析结果 绘制pdf
+    OP ofd 单位转换
+    """
+
    def __init__(self, data, *args, **kwargs):
        """Prepare the drawing state from an OFD parse result.

        data: list of per-document parse dicts; the first entry must carry
        "pdf_name", which names the output PDF.
        """
        assert data, "未输入ofd解析结果"
        self.data = data
        self.author = "renoyuan"
        # OFD lengths are in millimetres; 200/25.4 converts mm to pixels at
        # 200 DPI — presumably; confirm against how OP is applied below.
        self.OP = 200 / 25.4
        self.pdf_uuid_name = self.data[0]["pdf_name"]
        self.pdf_io = BytesIO()  # the rendered PDF accumulates here
        self.SupportImgType = ("JPG", "JPEG", "PNG")
        self.init_font = "宋体"  # fallback font when none is resolvable
        self.font_tool = FontTool()
        self.page_need_to_image_dict = {}
+
+    def draw_lines(my_canvas):
+        """
+        draw_line
+        """
+        my_canvas.setLineWidth(.3)
+
+        start_y = 710
+        my_canvas.line(30, start_y, 580, start_y)
+
+        for x in range(10):
+            start_y -= 10
+            my_canvas.line(30, start_y, 580, start_y)
+
+    def gen_empty_pdf(self):
+        """
+        """
+        c = canvas.Canvas(self.pdf_io)
+        c.setPageSize(A4)
+        c.setFont(self.init_font, 20)
+        c.drawString(0, 210, "ofd 格式错误,不支持解析", mode=1)
+        c.save()
+
+    # 单个字符偏移量计算
+    def cmp_offset(self, pos, offset, DeltaRule, text, CTM_info, dire="X") -> list:
+        """
+        pos 文本框x|y 坐标 
+        offset 第一个字符的X|Y 
+        DeltaRule 偏移量规则
+        resize 字符坐标缩放
+        返回 x|y  字符位置list 
+        """
+        if CTM_info and dire == "X":
+            resize = CTM_info.get("resizeX")
+            rotate = CTM_info.get("rotateX")
+            move = CTM_info.get("moveX")
+        elif CTM_info and dire == "Y":
+            resize = CTM_info.get("resizeY")
+            rotate = CTM_info.get("rotateY")
+            move = CTM_info.get("moveY")
+        else:
+            resize = 1
+            rotate = 0
+            move = 0
+
+        # print(f"resize is {resize}")
+        char_pos = float(pos if pos else 0) + (float(offset if offset else 0) + move) * resize
+        pos_list = []
+        pos_list.append(char_pos)  # 放入第一个字符
+        offsets = [i for i in DeltaRule.split(" ")]
+
+        if "g" in DeltaRule:  # g 代表多个元素
+            g_no = None
+            for _no, offset_i in enumerate(offsets):
+
+                if offset_i == "g":
+                    g_no = _no
+                    for j in range(int(offsets[(g_no + 1)])):
+                        char_pos += float(offsets[(g_no + 2)])
+                        pos_list.append(char_pos)
+
+                elif offset_i and offset_i != "g":
+                    if g_no == None:
+                        char_pos += float(offset_i) * resize
+                        pos_list.append(char_pos)
+                    elif (int(_no) > int(g_no + 2)) and g_no != None:
+                        # print(f"offset_i is {offset_i}")
+                        char_pos += float(offset_i) * resize
+                        pos_list.append(char_pos)
+
+        elif not DeltaRule:  # 没有字符偏移量 一般单字符
+            pos_list = []
+            for i in range(len(text)):
+                pos_list.append(char_pos)
+        else:  # 有字符偏移量
+            for i in offsets:
+                if not i:
+                    char_pos += 0
+                else:
+                    char_pos += float(i) * resize
+                pos_list.append(char_pos)
+
+        return pos_list
+
    def draw_chars_old(self, canvas, text_list, fonts, page_size):
        """Write text runs onto the canvas (legacy implementation).

        NOTE(review): superseded by draw_chars. Contains early ``break``
        statements that stop after the first character / first run, and a
        dead ``if False:`` branch referencing undefined names
        (draw_Glyph, FontFilePath) — do not re-enable as-is.
        """
        c = canvas
        for line_dict in text_list:
            # TODO serialize the whole body content before writing, to make
            # the final input values inspectable
            text = line_dict.get("text")
            # TODO decide whether a known shared font applies, otherwise
            # match the closest available font; for now always the default
            font_name = self.init_font

            font = self.font_tool.normalize_font_name(font_name)

            try:
                c.setFont(font, line_dict["size"] * self.OP)
            except KeyError as key_error:
                # reportlab raises KeyError for unregistered font names
                logger.error(f"{key_error}")
                font = self.font_tool.FONTS[0]
                c.setFont(font, line_dict["size"] * self.OP)
            # the PDF origin is the bottom-left corner of the page
            color = line_dict.get("color", [0, 0, 0])
            if len(color) < 3:
                color = [0, 0, 0]

            c.setFillColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
            c.setStrokeColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)

            DeltaX = line_dict.get("DeltaX", "")
            DeltaY = line_dict.get("DeltaY", "")
            X = line_dict.get("X", "")
            Y = line_dict.get("Y", "")
            CTM = line_dict.get("CTM", "")  # OFD adds this per-run scaling
            resizeX = 1
            resizeY = 1
            CTMS = None
            if CTM:
                CTMS = CTM.split(" ")

            if CTM and CTMS and len(CTMS) == 6:
                CTM_info = {
                    "resizeX": float(CTMS[0]),
                    "rotateX": float(CTMS[1]),
                    "rotateY": float(CTMS[2]),
                    "resizeY": float(CTMS[3]),
                    "moveX": float(CTMS[4]),
                    "moveY": float(CTMS[5]),

                }

            else:
                CTM_info = {}
            x_list = self.cmp_offset(line_dict.get("pos")[0], X, DeltaX, text, CTM_info, dire="X")
            y_list = self.cmp_offset(line_dict.get("pos")[1], Y, DeltaY, text, CTM_info, dire="Y")

            # writing glyph shapes for custom fonts via drawPath performed
            # too poorly, so this branch is permanently disabled (dead code)
            if False:
                Glyphs = [int(i) for i in line_dict.get("Glyphs_d").get("Glyphs").split(" ")]
                for idx, Glyph_id in enumerate(Glyphs):
                    _cahr_x = float(x_list[idx]) * self.OP
                    _cahr_y = (float(page_size[3]) - (float(y_list[idx]))) * self.OP
                    imageFile = draw_Glyph(FontFilePath.get(line_dict["font"]), Glyph_id, text[idx])

                    c.drawImage(imageFile, _cahr_x, _cahr_y, -line_dict["size"] * self.OP * 2,
                                line_dict["size"] * self.OP * 2)
            else:
                if len(text) > len(x_list) or len(text) > len(y_list):
                    # more chars than positions: keep only CJK characters
                    text = re.sub("[^\u4e00-\u9fa5]", "", text)
                try:
                    # line mode: the last character's computed y exceeds the
                    # page height, or its x exceeds the page width
                    if y_list[-1] * self.OP > page_size[3] * self.OP or x_list[-1] * self.OP > page_size[2] * self.OP or \
                            x_list[-1] < 0 or y_list[-1] < 0:
                        x_p = abs(float(X)) * self.OP
                        y_p = abs(float(page_size[3]) - (float(Y))) * self.OP
                        print('text, x_p, y_p', text, x_p, y_p)
                        c.drawString(x_p, y_p, text, mode=0)  # mode=3 invisible text, 0 visible

                    # character mode: write character by character
                    else:
                        for char_id, _char in enumerate(text):
                            if len(x_list) > char_id:
                                font_size = line_dict["size"] * self.OP * resizeX
                                c.setFont(font, line_dict["size"] * self.OP * resizeX)
                                _char_x = float(x_list[char_id]) * self.OP
                                _char_y = (float(page_size[3]) - (float(y_list[char_id]))) * self.OP
                                print('_cahr_, _char_x, _char_y', _char, _char_x, _char_y, font_size)
                                c.drawString(_char_x, _char_y, _char, mode=0)  # mode=3 invisible text, 0 visible
                                # NOTE(review): stops after the first char —
                                # only one character is ever drawn here
                                break
                            else:
                                pass
                        # NOTE(review): stops after the first run taking this
                        # path — remaining text_list entries are skipped
                        break
                except Exception as e:
                    logger.error(f"{e}")
                    traceback.print_exc()
+
    def draw_chars(self, canvas, text_list, fonts, page_size, pdf_page_size):
        """Write text runs onto the canvas.

        canvas: reportlab canvas.
        text_list: parsed OFD TextObject dicts (keys: text, size, CTM,
            color, pos, DeltaX/DeltaY, X/Y).
        fonts: font table from the parse result (currently unused — the
            default font is always taken).
        page_size: OFD page box; pdf_page_size: the same box in PDF units.

        NOTE(review): the validation branches ``return`` from the whole
        method, skipping every remaining run — ``continue`` may have been
        intended; confirm.
        """
        for line_dict in text_list:
            # TODO serialize the whole body content before writing, to make
            # the final input values inspectable
            text = line_dict.get("text")
            text_size = line_dict.get("size")
            if not text_size:
                print('draw_chars not text_size', text)
                return

            # transform matrix
            ctm = line_dict.get("CTM", '')
            ctm = self.get_ctm(ctm)
            a, b, c, d, e, f = ctm
            # average of the horizontal and vertical scale factors
            font_scale = (a + d) / 2

            color = line_dict.get("color", [0, 0, 0])
            if len(color) < 3:
                color = [0, 0, 0]
            canvas.setFillColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)

            # text box bounds
            boundary = line_dict.get("pos")
            if len(boundary) != 4:
                print('draw_chars not boundary', text, boundary)
                return
            left, top, width, height = boundary

            # a doubled "g" group in DeltaX signals duplicated text
            delta_x = line_dict.get("DeltaX", "")
            delta_y = line_dict.get("DeltaY", "")
            g_cnt = re.findall('g', delta_x)
            if len(g_cnt) >= 2:
                split_index = len(text) / 2
                if text[:int(split_index)] == text[int(split_index):]:
                    text2 = text[:int(split_index)]
                    print('len(g_cnt) >= 2', g_cnt, text, '->', text2)
                    text = text2

            # offset of the text relative to the boundary
            x = line_dict.get("X", "")
            y = line_dict.get("Y", "")
            if "" in [x, y]:
                print('draw_chars not x or not y', text, x, y)
                return
            x, y = float(x) * font_scale, float(y) * font_scale

            font_name = self.init_font
            font = self.font_tool.normalize_font_name(font_name)

            # actual coordinates from boundary + offsets
            # NOTE(review): actual_right/actual_bottom are never used, and
            # actual_bottom adds y rather than height — confirm intent.
            actual_left = left + x
            actual_right = actual_left + width
            actual_top = top + y
            actual_bottom = actual_top + y

            # font size from ctm and text_size
            actual_size = text_size * font_scale

            canvas.setFont(font, actual_size * self.OP)

            # OFD origin is top-left, PDF origin is bottom-left
            try:
                # write the whole run as one line
                canvas.drawString(actual_left * self.OP,
                                  pdf_page_size[3] - actual_top * self.OP,
                                  text, mode=0)

            except Exception as e:
                logger.error(f"{e}")
                traceback.print_exc()
+
    def draw_odf_char_on_image(self, line_dict, img, pos, ofd_page_size):
        """Draw one OFD text run directly onto a PIL image.

        Used when annotation text is rasterized into the page image rather
        than drawn as PDF text.

        line_dict: parsed TextObject dict; img: target PIL image;
        pos: the image's boundary in OFD units; ofd_page_size: the
        annotation page box (currently only used in commented-out code).

        Returns the mutated image, or None when the run is unusable —
        NOTE(review): callers that reassign img from this return value must
        handle the None case.
        """
        text = line_dict.get("text")
        text_size = line_dict.get("size")
        if not text_size:
            print('get_odf_char_info not text_size', text)
            return

        # transform matrix
        ctm = line_dict.get("CTM", '')
        ctm = self.get_ctm(ctm)
        a, b, c, d, e, f = ctm
        # average of the horizontal and vertical scale factors
        font_scale = (a + d) / 2

        color = line_dict.get("color", [0, 0, 0])
        if len(color) < 3:
            color = (0, 0, 0)
        else:
            color = tuple([int(x) for x in color])

        # text box bounds
        boundary = line_dict.get("pos")
        if len(boundary) != 4:
            print('get_odf_char_info not boundary', text, boundary)
            return
        left, top, width, height = boundary

        # offset of the text relative to the boundary; a y smaller than the
        # font size clips the glyphs
        x = line_dict.get("X", "")
        y = line_dict.get("Y", "")
        if "" in [x, y]:
            print('get_odf_char_info not x or not y', text, x, y)
            return
        x, y = float(x) * a, float(y) * d

        # actual coordinates from boundary + offsets
        # NOTE(review): actual_right/actual_bottom are computed but unused
        actual_left = left
        actual_right = actual_left + x
        actual_top = top
        actual_bottom = actual_top + y

        # font size from ctm and text_size, scaled into image pixels
        actual_size = text_size * font_scale
        actual_size = int(actual_size * img.size[0] / pos[2])

        # map OFD units to image pixels via the image/boundary ratio
        left_top_point = [actual_left * img.size[0] / pos[2], actual_top * img.size[1] / pos[3]]
        left_top_point = [int(x) for x in left_top_point]
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(os.path.dirname(__file__) + '/simsun.ttc', actual_size)

        draw.text(left_top_point, text, font=font, fill=color)
        return img
+
+    def compute_ctm(self, CTM, x1, y1, img_width, img_height):
+        """待定方法"""
+        a, b, c, d, e, f = CTM.split(" ")
+        a, b, c, d, e, f = float(a), float(b), float(c), float(d), float(e), float(f)
+        # 定义变换矩阵的元素
+
+        # 计算原始矩形的宽和高
+        x2 = x1 + img_width
+        y2 = y1 + img_height
+        print(f"ori x1 {x1} y1 {y1} x2 {x2} y2 {y2} img_width {img_width} img_height {img_height}")
+        a = a / 10
+        d = d / 10
+        # 对左上角和右下角点进行变换
+        x1_new = a * x1 + c * y1 + (e)
+        y1_new = b * x1 + d * y1 + (f)
+        x2_new = a * x2 + c * y2 + (e)
+        y2_new = b * x2 + d * y2 + (f)
+        print(f"x1_new {x1_new} y1_new {y1_new} x2_new {x2_new} y2_new {y2_new}")
+        # 计算变换后矩形的宽和高
+        w_new = x2_new - x1_new
+        h_new = y2_new - y1_new
+
+        print(f"原始矩形宽度: {img_width}, 高度: {img_height}")
+        print(f"变换后矩形宽度: {w_new}, 高度: {h_new}")
+        return x1_new, y1_new, w_new, h_new
+
+    def get_ctm(self, ctm):
+        default_ctm = (1, 0, 0, 1, 0, 0)
+        if not ctm:
+            # print('get_ctm no ctm!', ctm)
+            return default_ctm
+        ctm = ctm.split(" ")
+        if len(ctm) != 6:
+            print('get_ctm len(ctm) != 6', ctm)
+            return default_ctm
+        ctm = [float(x) for x in ctm]
+        # a, b, c, d, e, f = ctm
+        return ctm
+
    def draw_img_old(self, canvas, img_list, images, page_size):
        """Write images onto the canvas (legacy implementation).

        NOTE(review): superseded by draw_img. ``CTM = None`` below forcibly
        disables the CTM branch, and the final fallback overwrites its own
        computed placement, drawing the image at (0, 0) at its native pixel
        size — kept verbatim for reference.
        """
        c = canvas
        for img_d in img_list:
            image = images.get(img_d["ResourceID"])

            # skip missing resources and unsupported formats
            if not image or image.get("suffix").upper() not in self.SupportImgType:
                continue

            imgbyte = base64.b64decode(image.get('imgb64'))
            if not imgbyte:
                logger.error(f"{image['fileName']} is null")
                continue

            img = PILImage.open(BytesIO(imgbyte))
            img_width, img_height = img.size
            info = img.info
            imgReade = ImageReader(img)
            CTM = img_d.get('CTM')

            wrap_pos = image.get("wrap_pos")
            pos = img_d.get('pos')
            CTM = None  # CTM handling disabled — see note in docstring
            if CTM and not wrap_pos and page_size == pos:
                x1_new, y1_new, w_new, h_new = self.compute_ctm(CTM, 0, 0, img_width, img_height)
                pdf_pos = [pos[0] * self.OP, pos[1] * self.OP, pos[2] * self.OP, pos[3] * self.OP]
                print(f"pos: {pos} pdf_pos: {pdf_pos}")

                x1_new = (pos[0] + x1_new) * self.OP
                y1_new = (page_size[3] - y1_new) * self.OP
                # clamp the transformed size to the boundary
                if w_new > pdf_pos[2]:
                    w_new = pdf_pos[2]
                if h_new > pdf_pos[3]:
                    h_new = pdf_pos[3]
                print(f"写入 {x1_new} {y1_new} {w_new} {-h_new}")
                c.drawImage(imgReade, x1_new, y1_new, w_new, -h_new, 'auto')
            else:
                x_offset = 0
                y_offset = 0

                x = (pos[0] + x_offset) * self.OP
                y = (page_size[3] - (pos[1] + y_offset)) * self.OP
                if wrap_pos:
                    # inline/wrapped image: shift by the wrap position
                    x = x + (wrap_pos[0] * self.OP)
                    y = y - (wrap_pos[1] * self.OP)
                    w = img_d.get('pos')[2] * self.OP
                    h = -img_d.get('pos')[3] * self.OP

                    c.drawImage(imgReade, x, y, w, h, 'auto')
                elif pos:
                    x = pos[0] * self.OP
                    y = (page_size[3] - pos[1]) * self.OP
                    w = pos[2] * self.OP
                    h = -pos[3] * self.OP

                    # NOTE(review): the values above are immediately
                    # overwritten — origin placement at native size
                    x, y = 0, 0
                    w, h = img.size

                    print('x, y, w, h', x, y, w, h)

                    c.drawImage(imgReade, x, y, w, h, 'auto')
+
    def draw_img(self, canvas, img_list, images, ofd_page_size, pdf_page_size, ofd_to_pdf_ratio):
        """Write images onto the canvas.

        Each image is rotated per its CTM, resized into its PDF boundary,
        clamped to the page, and drawn anchored at the bottom-left.

        NOTE(review): the CTM is unpacked as ``a, b, d, e, f, g`` — the
        name ``d`` actually holds the matrix's third (shear/c) element, so
        ``b == 0 and d == 0`` tests both shear terms; also a missing CTM
        attribute (None) would raise on ``ctm.split`` — confirm upstream
        always supplies one.
        """
        c = canvas
        for img_d in img_list:
            image = images.get(img_d["ResourceID"])
            # skip missing resources and unsupported formats
            if not image or image.get("suffix").upper() not in self.SupportImgType:
                print('img_d["ResourceID"]', img_d["ResourceID"])
                logger.error(f"not image")
                continue

            imgbyte = base64.b64decode(image.get('imgb64'))
            if not imgbyte:
                logger.error(f"{image['fileName']} is null")
                continue

            img = PILImage.open(BytesIO(imgbyte))
            info = img.info
            ctm = img_d.get('CTM')
            pos = img_d.get('pos')
            # boundary converted into PDF units
            pdf_pos = [x * ofd_to_pdf_ratio for x in pos]
            if pos:
                # degenerate boundary: nothing to draw
                if pos[2] <= 0.1 or pos[3] <= 0.1:
                    print('pos[2] <= 0.1 or pos[3] <= 0.1')
                    continue
                # flip the y axis: OFD is top-left, PDF bottom-left origin
                x, y = pdf_pos[0], pdf_page_size[3] - pdf_pos[1] - pdf_pos[3]
                w, h = img.size
                ctm = ctm.split(' ')
                ctm = [float(x) for x in ctm]
                a, b, d, e, f, g = ctm
                if b == 0 and d == 0:
                    angle_deg = 0
                else:
                    # rotation angle; mirror flips would need extra handling
                    angle_rad = math.atan2(b, a)
                    angle_deg = math.degrees(angle_rad)
                    # normalize into the 0..360 degree range
                    angle_deg = angle_deg % 360
                img = img.rotate(-angle_deg, expand=1)
                img = img.resize((int(pdf_pos[2]), int(pdf_pos[3])), Image.BICUBIC)
                # clamp the image so it cannot exceed the page box
                img = image_resize_by_ratio(img, int(pdf_page_size[2]), int(pdf_page_size[3]))
                w, h = img.size
                if img.mode == 'P':
                    # palette images need converting before ImageReader
                    img = img.convert('RGBA')
                imgReade = ImageReader(img)
                c.drawImage(imgReade, x, y, w, h, 'auto')
+
    def draw_img_with_annot(self, canvas, img_list, images, annot_page_size, pdf_page_size, ofd_to_pdf_ratio, annot_page_info):
        """Write images onto the canvas, burning annotation text into each
        image first.

        Same pipeline as draw_img, except every text dict in
        annot_page_info is rasterized onto the image (via
        draw_odf_char_on_image) before the image is resized and drawn.
        """
        c = canvas
        for img_d in img_list:
            image = images.get(img_d["ResourceID"])
            # skip missing resources and unsupported formats
            if not image or image.get("suffix").upper() not in self.SupportImgType:
                print('img_d["ResourceID"]', img_d["ResourceID"])
                logger.error(f"not image")
                continue

            imgbyte = base64.b64decode(image.get('imgb64'))
            if not imgbyte:
                logger.error(f"{image['fileName']} is null")
                continue

            img = PILImage.open(BytesIO(imgbyte))
            ctm = img_d.get('CTM')
            pos = img_d.get('pos')
            # boundary converted into PDF units
            pdf_pos = [x * ofd_to_pdf_ratio for x in pos]
            if pos:
                # degenerate boundary: nothing to draw
                if pos[2] <= 0.1 or pos[3] <= 0.1:
                    print('pos[2] <= 0.1 or pos[3] <= 0.1')
                    continue
                # flip the y axis: OFD is top-left, PDF bottom-left origin
                x, y = pdf_pos[0], pdf_page_size[3] - pdf_pos[1] - pdf_pos[3]
                w, h = img.size
                ctm = ctm.split(' ')
                ctm = [float(x) for x in ctm]
                # NOTE(review): third element is bound to `d` (see draw_img)
                a, b, d, e, f, g = ctm
                if b == 0 and d == 0:
                    angle_deg = 0
                else:
                    # rotation angle; mirror flips would need extra handling
                    angle_rad = math.atan2(b, a)
                    angle_deg = math.degrees(angle_rad)
                    # normalize into the 0..360 degree range
                    angle_deg = angle_deg % 360

                img = img.rotate(-angle_deg, expand=1)
                print('angle_deg', angle_deg)

                # burn the annotation text runs into the image
                # NOTE(review): draw_odf_char_on_image returns None for bad
                # runs, which would clobber img here — confirm inputs are
                # pre-validated.
                for text_d in annot_page_info:
                    print('img pos', pos)
                    img = self.draw_odf_char_on_image(text_d, img, pos, annot_page_size)

                img = img.resize((int(pdf_pos[2]), int(pdf_pos[3])), Image.BICUBIC)
                # clamp the image so it cannot exceed the page box
                img = image_resize_by_ratio(img, int(pdf_page_size[2]), int(pdf_page_size[3]))

                w, h = img.size
                if img.mode == 'P':
                    # palette images need converting before ImageReader
                    img = img.convert('RGBA')
                imgReade = ImageReader(img)
                c.drawImage(imgReade, x, y, w, h, 'auto')
+
    def draw_signature(self, canvas, signatures_page_list, page_size):
        """Write seal/signature images onto the canvas.

        Each entry in signatures_page_list looks like:
            {
            "sing_page_no": sing_page_no,
            "PageRef": PageRef,
            "Boundary": Boundary,
            "SignedValue": self.file_tree(SignedValue),
                            }
        The seal bitmap is extracted from the SignedValue blob; any failure
        is swallowed deliberately — a missing or broken seal must not abort
        rendering of the page.
        """
        c = canvas
        try:
            if signatures_page_list:
                for signature_info in signatures_page_list:
                    image = SealExtract()(b64=signature_info.get("SignedValue"))
                    if not image:
                        # no seal image could be extracted from this entry
                        continue
                    else:
                        image_pil = image[0]

                    # "x y w h" boundary in OFD units
                    pos = [float(i) for i in signature_info.get("Boundary").split(" ")]

                    imgReade = ImageReader(image_pil)

                    # flip y: OFD top-left origin -> PDF bottom-left origin;
                    # negative height draws the image downwards
                    x = pos[0] * self.OP
                    y = (page_size[3] - pos[1]) * self.OP

                    w = pos[2] * self.OP
                    h = -pos[3] * self.OP
                    c.drawImage(imgReade, x, y, w, h, 'auto')
            else:
                # no signatures on this page
                pass
        except Exception as e:
            # best-effort: print the traceback but keep rendering
            traceback.print_exc()
+
+    def draw_line_old(self, canvas, line_list, page_size):
+        """Draw line/path elements (legacy implementation, superseded by draw_line)."""
+
+        # print("drawing", line_list)
+
+        def match_mode(Abbr: list):
+            """
+            Parse AbbreviatedData tokens into drawing operators.
+            Operators:
+            S  define start point x, y
+            M  move to x, y
+            L  line from current point to x, y
+            Q  x1 y1 x2 y2 quadratic Bezier curve
+            B  x1 y1 x2 y2 x3 y3 cubic Bezier curve
+            A  arc to x,y; rx = major axis, ry = minor axis, angle = rotation,
+               large=1 means the arc spans more than 180 degrees (0 = less),
+               sweep=1 means clockwise, 0 means counter-clockwise
+            C  close the current SubPath automatically
+            """
+            relu_list = []
+            mode = ""  # NOTE(review): assigned but never read afterwards
+            modes = ["S", "M", "L", "Q", "B", "A", "C"]
+            mode_dict = {}
+            for idx, i in enumerate(Abbr):
+                if i in modes:
+                    mode = i
+                    # Flush the previous operator before starting a new one.
+                    if mode_dict:
+                        relu_list.append(mode_dict)
+                    mode_dict = {"mode": i, "points": []}
+
+                else:
+                    # Non-operator token: an operand for the current operator.
+                    mode_dict["points"].append(i)
+
+                # Last token: flush the trailing operator.
+                if idx + 1 == len(Abbr):
+                    relu_list.append(mode_dict)
+            return relu_list
+
+        def assemble(relu_list: list):
+            # Pair each drawing operator with the most recent M (move) as its start.
+            start_point = {}
+            acticon = []
+            for i in relu_list:
+                if i.get("mode") == "M":
+                    start_point = i
+                elif i.get("mode") in ['B', "Q", 'L']:
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+            return acticon
+
+        def convert_coord(p_list, direction, page_size, pos):
+            """Convert OFD coordinates to PDF coordinates (y axis flipped)."""
+            new_p_l = []
+            for p in p_list:
+                if direction == "x":
+
+                    new_p = (float(pos[0]) + float(p)) * self.OP
+                else:
+                    new_p = (float(page_size[3]) - float(pos[1]) - float(p)) * self.OP
+                new_p_l.append(new_p)
+            return new_p_l
+
+        for line in line_list:
+            Abbr = line.get("AbbreviatedData").split(" ")  # AbbreviatedData tokens
+            color = line.get("FillColor", [0, 0, 0])
+
+            relu_list = match_mode(Abbr)
+            # TODO combine relu_list: 1) M L straight line 2) M B*n cubic Bezier 3) M Q*n quadratic Bezier
+
+            # print(relu_list)
+
+            acticons = assemble(relu_list)
+            pos = line.get("pos")
+            # print(color)
+            if len(color) < 3:
+                color = [0, 0, 0]  # fall back to black when the color triple is malformed
+            canvas.setStrokeColorRGB(*(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255))  # stroke color
+
+            # Set line width (falls back to 0.25 OFD units on any parse error).
+            try:
+                LineWidth = (float(line.get("LineWidth", "0.25").replace(" ", "")) if \
+                                 line.get("LineWidth", "0.25").replace(" ", "") else 0.25) * self.OP
+            except Exception as e:
+                # logger.error(f"{e}")
+                LineWidth = 0.25 * self.OP
+
+            canvas.setLineWidth(LineWidth)  # unit is points; 2 means 2 points
+
+            for acticon in acticons:
+                if acticon.get("end_point").get("mode") == 'L':  # straight line
+                    x1, y1, x2, y2 = *acticon.get("start_point").get("points"), *acticon.get("end_point").get("points")
+                    x1, x2 = convert_coord([x1, x2], "x", page_size, pos)
+                    y1, y2 = convert_coord([y1, y2], "y", page_size, pos)
+                    # Draw one line segment x1 y1 -> x2 y2.
+                    canvas.line(x1, y1, x2, y2)
+
+                elif acticon.get("end_point").get("mode") == 'B':  # cubic Bezier
+                    # NOTE(review): `continue` makes everything below in this branch
+                    # unreachable — cubic Beziers are deliberately skipped here.
+                    continue
+                    x1, y1, x2, y2, x3, y3, x4, y4 = *acticon.get("start_point").get("points"), *acticon.get(
+                        "end_point").get("points")
+                    x1, x2, x3, x4 = convert_coord([x1, x2, x3, x4], "x", page_size, pos)
+                    y1, y2, y3, y4 = convert_coord([y1, y2, y3, y4], "y", page_size, pos)
+                    # print(x1, y1, x2, y2, x3, y3, x4, y4)
+
+                    # Draw a cubic Bezier curve.
+                    canvas.bezier(x1, y1, x2, y2, x3, y3, x4, y4)
+
+                elif acticon.get("end_point").get("mode") == 'Q':  # quadratic Bezier (not implemented)
+                    pass
+                else:
+                    continue
+
+    def draw_line_old_250619(self, canvas, line_list, page_size):
+        """Legacy path-drawing variant (2025-06-19); builds one canvas path per line element."""
+        def match_mode(Abbr: list):
+            """
+            Parse AbbreviatedData tokens into drawing operators.
+            Operators:
+            S  define the start point x, y
+            M  move to x, y
+            L  line from the current point to x, y
+            Q  x1 y1 x2 y2: quadratic Bezier from the current point to (x2,y2),
+               with (x1,y1) as the control point; current point becomes (x2,y2).
+            B  x1 y1 x2 y2 x3 y3: cubic Bezier from the current point to (x3,y3),
+               with (x1,y1) and (x2,y2) as control points; current point becomes (x3,y3).
+            A  operands rx ry angle large sweep x y: arc from the current point to (x,y),
+               current point becomes (x,y). rx is the ellipse major axis, ry the minor
+               axis, angle the ellipse rotation in the current coordinate system
+               (positive = clockwise, negative = counter-clockwise); large=1 means the
+               arc spans more than 180 degrees (0 = less than 180); sweep=1 means the
+               arc runs clockwise from start to end, 0 means counter-clockwise.
+            C  no operands: close the SubPath by joining the current point to its
+               start point with a straight segment.
+            """
+            relu_list = []
+            mode = ""  # NOTE(review): assigned but never read afterwards
+            modes = ["S", "M", "L", "Q", "B", "A", "C"]
+            mode_dict = {}
+            for idx, i in enumerate(Abbr):
+                if i in modes:
+                    mode = i
+                    # Flush the previous operator before starting a new one.
+                    if mode_dict:
+                        relu_list.append(mode_dict)
+                    mode_dict = {"mode": i, "points": []}
+
+                else:
+                    mode_dict["points"].append(i)
+
+                if idx + 1 == len(Abbr):
+                    relu_list.append(mode_dict)
+            return relu_list
+
+        def assemble(relu_list: list):
+            # Pair each operator with a start point; unlike draw_line_old this also
+            # keeps M/C/A operators as drawable actions.
+            start_point = {}
+            acticon = []
+
+            for i in relu_list:
+                if i.get("mode") == "M":
+                    # Only the first M becomes the persistent start point.
+                    if not start_point:
+                        start_point = i
+                    acticon.append({
+                        "start_point": start_point, "end_point": i})
+
+                elif i.get("mode") in ['B', "Q", 'L']:
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "C":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "A":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "S":
+                    start_point = i
+
+            return acticon
+
+        def convert_coord(p_list, direction, page_size, pos):
+            """Convert OFD coordinates to PDF coordinates (y axis flipped)."""
+            new_p_l = []
+            # print("p_list", p_list)
+            for p in p_list:
+                if direction == "x":
+                    new_p = (float(pos[0]) + float(p)) * self.OP
+                else:
+                    new_p = (float(page_size[3]) - float(pos[1]) - float(p)) * self.OP
+                new_p_l.append(new_p)
+            # print("new_p_l", new_p_l)
+            return new_p_l
+
+        for line in line_list:
+            print('one line', "="*20)
+            path = canvas.beginPath()
+            Abbr = line.get("AbbreviatedData").split(" ")  # AbbreviatedData tokens
+            color = line.get("FillColor", [0, 0, 0])
+
+            relu_list = match_mode(Abbr)
+            # TODO combine relu_list: 1) M L straight line 2) M B*n cubic Bezier 3) M Q*n quadratic Bezier
+
+            # print(relu_list)
+
+            acticons = assemble(relu_list)
+            pos = line.get("pos")
+            # print(color)
+            if len(color) < 3:
+                color = [0, 0, 0]  # fall back to black when the color triple is malformed
+            canvas.setStrokeColorRGB(*(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255))  # stroke color
+
+            # Set line width (falls back to 0.25 OFD units on any parse error).
+            try:
+                LineWidth = (float(line.get("LineWidth", "0.25").replace(" ", "")) if \
+                                 line.get("LineWidth", "0.25").replace(" ", "") else 0.25) * self.OP
+            except Exception as e:
+                logger.error(f"{e}")
+                LineWidth = 0.25 * self.OP
+
+            canvas.setLineWidth(LineWidth)  # unit is points; 2 means 2 points
+            cur_point = []  # last pen position, used by the arc branch
+            for acticon in acticons:
+                if acticon.get("end_point").get("mode") == 'M':
+                    x, y = acticon.get("end_point").get("points")
+                    x = convert_coord([x], "x", page_size, pos)[0]
+                    y = convert_coord([y], "y", page_size, pos)[0]
+                    cur_point = [x, y]
+                    path.moveTo(x, y)
+
+                elif acticon.get("end_point").get("mode") == 'L':  # straight line
+                    x, y = acticon.get("end_point").get("points")
+                    print('path L x, y', x, y)
+                    x = convert_coord([x], "x", page_size, pos)[0]
+                    y = convert_coord([y], "y", page_size, pos)[0]
+                    print('path L x, y2', x, y)
+                    path.lineTo(x, y)
+
+
+                elif acticon.get("end_point").get("mode") == 'B':  # cubic Bezier
+                    x1, y1, x2, y2, x3, y3 = acticon.get("end_point").get("points")
+                    # print(x1, y1, x2, y2, x3, y3)
+                    x1, x2, x3 = convert_coord([x1, x2, x3], "x", page_size, pos)
+                    y1, y2, y3 = convert_coord([y1, y2, y3], "y", page_size, pos)
+                    cur_point = [x2, y2]
+                    path.curveTo(x1, y1, x2, y2, x3, y3)
+                    path.moveTo(x3, y3)
+
+                elif acticon.get("end_point").get("mode") == 'Q':  # quadratic Bezier
+                    x1, y1, x2, y2 = acticon.get("end_point").get("points")
+                    x1, x2 = convert_coord([x1, x2], "x", page_size, pos)
+                    y1, y2 = convert_coord([y1, y2], "y", page_size, pos)
+                    cur_point = [x2, y2]
+                    # Emulated as a degenerate cubic (end point doubled as control point).
+                    path.curveTo(x1, y1, x2, y2, x2, y2)
+                    path.moveTo(x2, y2)
+                elif acticon.get("end_point").get("mode") == 'A':  # arc
+                    x1, y1 = acticon.get("start_point").get("points")
+                    rx, ry, startAng, large_arc_flag, sweep_flag, x2, y2 = acticon.get("end_point").get("points")
+                    rx_o = rx
+                    ry_o = ry
+
+                    x1, x2, rx = convert_coord([x1, x2, rx], "x", page_size, pos)
+                    y1, y2, ry = convert_coord([y1, y2, ry], "y", page_size, pos)
+
+                    cur_x, cur_y = cur_point
+
+                    # Arc drawing — known to be buggy (author's own note kept below).
+                    if rx_o == ry_o:
+                        # path.circle(cur_x,cur_y, 20) # circle
+                        path.circle(rx, ry, 20)  # circle  # FIXME: produces an arbitrary circle
+                    else:
+                        print(rx, ry, x2, y2, startAng, large_arc_flag, sweep_flag)
+                        path.ellipse(rx, ry, 20, 20, )  # ellipse
+                    # path.arc(rx, ry, x2, y2, startAng=int(startAng), extent=int(sweep_flag))
+                    # path.ellipse(rx, ry,x2, y2, ) # ellipse
+                    # path.curveTo(rx, ry ,x2, y2, startAng=int(startAng), extent=int(sweep_flag))
+                    path.moveTo(x2, y2)
+                    cur_point = [x2, y2]
+
+                elif acticon.get("end_point").get("mode") == 'C':
+                    # canvas.drawPath(path)
+                    path.close()
+            canvas.drawPath(path)
+
+    def draw_line(self, canvas, line_list, page_size, pdf_page_size):
+        """
+        Draw OFD path objects onto the PDF canvas (current implementation).
+
+        Differences from the legacy variants: coordinates go through the CTM
+        transform via get_actural_p, and only M/L/C operators are rendered
+        (B/Q/A operators are parsed but silently ignored here).
+        """
+        def match_mode(Abbr: list):
+            """
+            Parse AbbreviatedData tokens into drawing operators.
+            Operators:
+            S  define the start point x, y
+            M  move to x, y
+            L  line from the current point to x, y
+            Q  x1 y1 x2 y2: quadratic Bezier from the current point to (x2,y2),
+               with (x1,y1) as the control point; current point becomes (x2,y2).
+            B  x1 y1 x2 y2 x3 y3: cubic Bezier from the current point to (x3,y3),
+               with (x1,y1) and (x2,y2) as control points; current point becomes (x3,y3).
+            A  operands rx ry angle large sweep x y: arc from the current point to (x,y),
+               current point becomes (x,y). rx is the ellipse major axis, ry the minor
+               axis, angle the ellipse rotation in the current coordinate system
+               (positive = clockwise, negative = counter-clockwise); large=1 means the
+               arc spans more than 180 degrees (0 = less than 180); sweep=1 means the
+               arc runs clockwise from start to end, 0 means counter-clockwise.
+            C  no operands: close the SubPath by joining the current point to its
+               start point with a straight segment.
+            """
+            relu_list = []
+            mode = ""  # NOTE(review): assigned but never read afterwards
+            modes = ["S", "M", "L", "Q", "B", "A", "C"]
+            mode_dict = {}
+            for idx, i in enumerate(Abbr):
+                if i in modes:
+                    mode = i
+                    # Flush the previous operator before starting a new one.
+                    if mode_dict:
+                        relu_list.append(mode_dict)
+                    mode_dict = {"mode": i, "points": []}
+
+                else:
+                    mode_dict["points"].append(i)
+
+                if idx + 1 == len(Abbr):
+                    relu_list.append(mode_dict)
+            return relu_list
+
+        def assemble(relu_list: list):
+            # Pair each operator with a start point; M/C/A are kept as actions too.
+            start_point = {}
+            acticon = []
+
+            for i in relu_list:
+                if i.get("mode") == "M":
+                    # Only the first M becomes the persistent start point.
+                    if not start_point:
+                        start_point = i
+                    acticon.append({
+                        "start_point": start_point, "end_point": i})
+
+                elif i.get("mode") in ['B', "Q", 'L']:
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "C":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "A":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "S":
+                    start_point = i
+
+            return acticon
+
+        for line in line_list:
+            # print('one line', "="*20)
+            path = canvas.beginPath()
+            abbr = line.get("AbbreviatedData").split(" ")
+            color = line.get("FillColor", [0, 0, 0])
+
+            # Parse path operators.
+            relu_list = match_mode(abbr)
+            actions = assemble(relu_list)
+
+            # Transformation matrix (CTM) applied to all path coordinates.
+            ctm = line.get("CTM", '')
+            ctm = self.get_ctm(ctm)
+
+            # Bounding box of the path object; must be "left bottom width height".
+            boundary = line.get("pos")
+            if len(boundary) != 4:
+                print('draw_line not boundary', boundary)
+                # NOTE(review): returns from the whole method, skipping all
+                # remaining lines — presumably a `continue` was intended; confirm.
+                return
+
+            # Set stroke color (fall back to black when the triple is malformed).
+            if len(color) < 3:
+                color = [0, 0, 0]
+            canvas.setStrokeColorRGB(*(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255))  # stroke color
+
+            # Fixed line width (0.20 OFD units converted to points).
+            line_w = 0.20 * self.OP
+            canvas.setLineWidth(line_w)
+
+            for action in actions:
+                if action.get("end_point").get("mode") == 'M':
+                    x, y = action.get("end_point").get("points")
+                    # print('path M x, y', x, y)
+                    x, y = self.get_actural_p(x, y, ctm, boundary)
+                    x = x * self.OP
+                    y = pdf_page_size[3] - y * self.OP  # flip y: OFD top-left -> PDF bottom-left
+                    # print('path M x, y2', x, y)
+                    path.moveTo(x, y)
+
+                elif action.get("end_point").get("mode") == 'L':  # straight line
+                    x, y = action.get("end_point").get("points")
+                    # print('path L x, y', x, y)
+                    x, y = self.get_actural_p(x, y, ctm, boundary)
+                    # print('path L x, y1', x, y)
+                    x = x * self.OP
+                    y = pdf_page_size[3] - y * self.OP
+                    # print('path L x, y2', x, y)
+                    path.lineTo(x, y)
+
+                elif action.get("end_point").get("mode") == 'C':
+                    path.close()
+            canvas.drawPath(path)
+
+    def get_actural_p(self, x, y, ctm, boundary):
+        """
+        Map a path-local (x, y) point to page coordinates using the CTM scale
+        components and the object's bounding box offset.
+
+        ctm is a 6-tuple (a, b, c, d, e, f); only the scale terms a and d are
+        applied — the shear (b, c) and translation (e, f) components are
+        ignored here. NOTE(review): presumably intentional for axis-aligned
+        paths, but confirm against OFD CTM semantics.
+        boundary is (left, bottom, width, height).
+        """
+        x, y = float(x), float(y)
+        a, b, c, d, e, f = ctm
+        left, bottom, width, height = boundary
+        # print('left, x, a', left, x, a, type(left), type(x), type(a))
+        x = left + x * a
+        y2 = bottom + y * d
+        y1 = y2 + height  # NOTE(review): computed but unused — leftover from an earlier y-flip?
+        return x, y2
+
+    def draw_pdf(self):
+        """
+        Render every parsed OFD document in self.data into the in-memory PDF
+        (self.pdf_io) via reportlab.
+
+        Returns a dict {page_index: bool} marking pages that have both images
+        and annotation text — those pages presumably need rasterization by the
+        caller (TODO confirm against caller usage).
+        """
+        c = canvas.Canvas(self.pdf_io)
+        c.setAuthor(self.author)
+        page_need_to_image_dict = {}
+        for doc_id, doc in enumerate(self.data, start=0):
+            # print(1)
+            fonts = doc.get("fonts")
+            images = doc.get("images")
+            default_page_size = doc.get("default_page_size")
+            page_size_details = doc.get("page_size")
+            # print("page_size_details", page_size_details)
+            signatures_page_id = doc.get("signatures_page_id")  # signature info (currently unused below)
+            # annot_page_info = doc.get("annot_page_info")
+
+            # Font registration (disabled).
+            # for font_id, font_v in fonts.items():
+            #     file_name = font_v.get("FontFile")
+            #     font_b64 = font_v.get("font_b64")
+            #     if font_b64:
+            #         self.font_tool.register_font(os.path.split(file_name)[1], font_v.get("@FontName"), font_b64)
+
+            # Skip documents whose page count does not match the page-size list.
+            if len(doc.get("page_info")) != len(page_size_details):
+                print('len(doc.get("page_info")) != len(page_size_details)')
+                continue
+
+            # Sort page ids numerically; skip the document if ids are not integers.
+            page_id_list = list(doc.get("page_info").keys())
+            try:
+                page_id_list.sort(key=lambda x: int(x))
+            except:
+                traceback.print_exc()
+                print('sort page_id_list error!', page_id_list)
+                continue
+
+            # text_img_idwrite = []
+            # print("doc.get(page_info)", len(doc.get("page_info")))
+            for pi, page_id in enumerate(page_id_list):
+                page = doc.get("page_info").get(page_id)
+                annot_text_list = doc.get("page_info").get(page_id).get('annot_text_list')
+                # print('page111', page)
+                # print(f"page_id: {page_id} page_size_details: {page_size_details}")
+                # if len(page_size_details) > page_id and page_size_details[page_id]:
+                #     page_size = page_size_details[page_id]
+                # else:
+                #     page_size = default_page_size
+                page_size = page_size_details[pi]
+                # logger.info(f"page_id {page_id} page_size {page_size}")
+                text_list = page.get("text_list")
+                img_list = page.get("img_list")
+                line_list = page.get("line_list")
+                # print("img_list",img_list)
+                # print('page_size222', page_size)
+                # page_size is [x, y, w, h] in OFD units; convert to PDF points.
+                c.setPageSize((page_size[2] * self.OP, page_size[3] * self.OP))
+                pdf_page_size = [x * self.OP for x in page_size]
+
+                # print('len(img_list), len(images), len(text_list), len(line_list)', len(img_list), len(images), len(text_list), len(line_list))
+
+                # Draw images.
+                # print('annot_text_list', annot_text_list)
+                # if img_list and annot_text_list:
+                #     annot_page_size = doc.get("page_info").get(page_id).get('annot_page_size')
+                #     print('annot_page_size111', annot_page_size)
+                #     self.draw_img_with_annot(c, img_list, images, annot_page_size, pdf_page_size, self.OP, annot_text_list)
+
+                # Pages that have both images and annotation text are flagged.
+                if img_list and annot_text_list:
+                    page_need_to_image_dict[pi] = True
+                else:
+                    page_need_to_image_dict[pi] = False
+                if img_list:
+                    self.draw_img(c, img_list, images, page_size, pdf_page_size, self.OP)
+
+                # Draw text.
+                if text_list:
+                    # Normalize special glyphs (e.g. Kangxi radicals) to basic Chinese characters.
+                    for line_dict in text_list:
+                        text = line_dict.get("text")
+                        line_dict['text'] = special_font_to_normal(text)
+                        # print('draw_chars, text', text, line_dict.get('pos'))
+                    self.draw_chars(c, text_list, fonts, page_size, pdf_page_size)
+
+                # Draw line paths.
+                if line_list:
+                    # for line in line_list:
+                    #     print('line', line)
+                    self.draw_line(c, line_list, page_size, pdf_page_size)
+
+                # Draw signatures (disabled).
+                # if signatures_page_id:
+                #     self.draw_signature(c, signatures_page_id.get(page_id), page_size)
+
+                # print("about to write")
+                # print(doc_id,len(self.data))
+                # Old page-break guard logic (disabled):
+                # if page_id != len(doc.get("page_info")) - 1 and doc_id != len(self.data):
+                #     # print("writing")
+                #     c.showPage()
+                    # json.dump(text_write,open("text_write.json","w",encoding="utf-8"),ensure_ascii=False)
+                c.showPage()
+        c.save()
+        return page_need_to_image_dict
+
+    def __call__(self):
+        """
+        Render the OFD data to PDF and return the PDF bytes.
+
+        On any failure the error is logged, an empty PDF is generated instead
+        (gen_empty_pdf), and page_need_to_image_dict is reset to {} — the
+        caller always gets valid PDF bytes, never an exception.
+        """
+        try:
+            page_need_to_image_dict = self.draw_pdf()
+            self.page_need_to_image_dict = page_need_to_image_dict
+            pdfbytes = self.pdf_io.getvalue()
+        except Exception as e:
+            logger.error(f"{e}")
+            logger.error(f"ofd解析失败")
+            traceback.print_exc()
+            # Fall back to an empty PDF so downstream code still gets bytes.
+            self.gen_empty_pdf()
+            self.page_need_to_image_dict = {}
+            pdfbytes = self.pdf_io.getvalue()
+        return pdfbytes
+
+
+

+ 113 - 0
format_convert/easyofd/easyofd/draw/find_seal_img.py

@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd read_seal_img
+# CREATE_TIME: 2024/5/28 14:13
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note: 根据 ASN.1 解析签章 拿到 签章图片
+import io
+import base64
+
+from PIL import Image, UnidentifiedImageError
+from loguru import logger
+from pyasn1.codec.der.decoder import decode
+from pyasn1.type import univ
+from pyasn1.error import PyAsn1Error
+
+
+
+class SealExtract(object):
+    """Extract embedded seal/stamp images from an OFD SignedValue blob via ASN.1 parsing."""
+    def __init__(self,):
+        pass
+    def read_signed_value(self, path="", b64=""):
+        """
+        Load the signed-value bytes (from base64 or a file path) and decode
+        them as a generic ASN.1 structure. Returns the decoded structure, or
+        None when decoding fails or no input was given.
+        """
+        # Read the raw binary data.
+        if b64:
+            binary_data = base64.b64decode(b64)
+        elif path:
+            # print("seal_path",path)
+            with open(path, 'rb') as file:
+                binary_data = file.read()
+        else:
+            return
+        # Try to decode as a generic ASN.1 structure.
+        try:
+            decoded_data, _ = decode(binary_data)
+        except (PyAsn1Error,) as e:
+            logger.warning(f"Decoding failed: {e}")
+            decoded_data = None
+        except (AttributeError,) as e:
+            logger.warning(f"AttributeError failed: {e}")
+            decoded_data = None
+        finally:
+           # NOTE(review): `return` inside `finally` suppresses any exception
+           # not caught above — confirm this best-effort behavior is intended.
+           return  decoded_data
+
+
+    def find_octet_strings(self, asn1_data,octet_strings:list):
+        """Recursively collect every OctetString instance into octet_strings (mutated in place)."""
+
+        # Recursively search for all OctetString instances.
+
+        if isinstance(asn1_data, univ.OctetString):
+
+            octet_strings.append(asn1_data)
+        elif isinstance(asn1_data, univ.Sequence) or isinstance(asn1_data, univ.Set):
+            for component in asn1_data:
+                self.find_octet_strings(asn1_data[f"{component}"], octet_strings)
+        elif isinstance(asn1_data, univ.Choice):
+            self.find_octet_strings(asn1_data.getComponent(), octet_strings)
+        elif isinstance(asn1_data, univ.Any):
+            # Any wraps opaque bytes: try decoding them as nested ASN.1.
+            try:
+                sub_data, _ = decode(asn1_data.asOctets())
+                self.find_octet_strings(sub_data, octet_strings)
+            except PyAsn1Error:
+                pass
+
+
+    def hex_to_image(self, hex_data, image_format='PNG',inx=0):
+        """
+        Convert hex-encoded image data to a PIL Image.
+
+        :param hex_data: hex string of the image bytes
+        :param image_format: image format, default 'PNG' (only used by the disabled save call)
+        :return: PIL Image, or None when the bytes are not a recognizable image
+        """
+        # Hex string -> raw bytes.
+
+        binary_data = bytes.fromhex(hex_data)
+
+        # Wrap in BytesIO so Pillow can read it.
+        image_stream = io.BytesIO(binary_data)
+
+        # Open with Pillow; None is returned implicitly if it is not an image.
+        try:
+            image = Image.open(image_stream)
+            # image.save(f'{inx}_image.{image_format}', format=image_format)
+            # print(f"image saved as 'image.{image_format}'")
+            return image
+        except UnidentifiedImageError:
+            # logger.info("not img ")
+            pass
+
+    def __call__(self, path="", b64=""):
+        """Decode the signed value and return a list of extracted PIL images (possibly empty)."""
+
+        decoded_data = self.read_signed_value(path=path, b64=b64)
+        octet_strings = []
+        img_list = []  # currently only one image is expected; multi-image association is future work
+        if decoded_data:
+            self.find_octet_strings(decoded_data, octet_strings)
+
+            for i, octet_string in enumerate(octet_strings):
+                # logger.info(f"octet_string{octet_string}")
+                # prettyPrint renders binary OctetStrings as "0x..." hex dumps.
+                if str(octet_string.prettyPrint()).startswith("0x"):
+
+                    img = self.hex_to_image(str(octet_string.prettyPrint())[2:],inx= i)
+                    if img:
+                        # logger.info("ASN.1 data found.")
+                        img_list.append(img)
+        else:
+            pass
+            # logger.info("No valid ASN.1 data found.")
+
+        return  img_list
+
+# Manual smoke test against a local SignedValue.dat sample (developer machine path).
+if __name__=="__main__":
+    print(SealExtract()(r"F:\code\easyofd\test\1111_xml\Doc_0\Signs\Sign_0\SignedValue.dat" ))
+

+ 216 - 0
format_convert/easyofd/easyofd/draw/font_tools.py

@@ -0,0 +1,216 @@
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+#PROJECT_NAME: D:\code\easyofd\easyofd
+#CREATE_TIME: 2023-07-27 
+#E_MAIL: renoyuan@foxmail.com
+#AUTHOR: reno 
+#NOTE: 字体处理
+import sys
+import time
+import re
+import json
+import base64
+import zipfile
+import os
+import shutil
+import logging
+from io import BytesIO, StringIO
+import string
+from uuid import uuid1
+import random
+import traceback
+import logging
+
+
+import tempfile
+import xmltodict
+from fontTools.ttLib import TTFont as ttLib_TTFont
+from fontTools.pens.basePen import BasePen
+from reportlab.graphics.shapes import Path
+from reportlab.lib import colors
+from reportlab.graphics import renderPM
+from reportlab.graphics.shapes import Group, Drawing, scale
+from reportlab import platypus
+from reportlab.lib.pagesizes import letter, A4
+from reportlab.lib.units import mm,inch
+from reportlab.platypus import SimpleDocTemplate, Image
+from reportlab.lib.utils import ImageReader
+from reportlab.pdfgen import canvas
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.cidfonts import UnicodeCIDFont
+from reportlab.pdfbase.ttfonts import TTFont
+from concurrent.futures import ThreadPoolExecutor
+import threading
+import multiprocessing
+import PIL
+
+
+from reportlab.lib.fonts import _tt2ps_map 
+from reportlab.lib.fonts import _family_alias
+
+
+sys.path.append(os.path.dirname(__file__) + "/../../../../")
+
+from format_convert.easyofd.easyofd.draw import FONTS
+
+from loguru import logger
+
+
+
+class FontTool(object):
+    """Discover system-installed fonts and register OFD-embedded fonts with reportlab."""
+    FONTS = FONTS  # class-level default list, replaced per-instance in __init__
+    def __init__(self):
+        # Initially supported fonts: probe the system font directories once
+        # and cache the discovered names on this instance.
+        # logger.debug("FontTool init ,read system default Font ... ")
+        self.FONTS = self.get_installed_fonts()
+        # logger.debug(f"system default Font is \n{self.FONTS} \n{'-'*50}")
+
+
+    def get_system_font_dirs(self,):
+        """Return the font directories for the current operating system."""
+        system = os.name
+        if system == 'nt':  # Windows
+            return [os.path.join(os.environ['WINDIR'], 'Fonts')]
+        elif system == 'posix':  # Linux/macOS
+            return [
+                '/usr/share/fonts',
+                '/usr/local/share/fonts',
+                os.path.expanduser('~/.fonts'),
+                os.path.expanduser('~/.local/share/fonts'),
+                '/Library/Fonts',  # macOS
+                '/System/Library/Fonts'  # macOS
+            ]
+        else:
+            return []
+
+    def normalize_font_name(self, font_name):
+        """Normalize a font name, e.g. 'Times New Roman Bold' -> 'TimesNewRoman-Bold'."""
+        # Strip spaces, then join style suffixes (Bold/Italic/...) with a hyphen.
+        normalized = font_name.replace(' ', '')
+        # Handle common style suffixes.
+        for style in ['Bold', 'Italic', 'Regular', 'Light', 'Medium', ]:
+            if style in normalized:
+                normalized = normalized.replace(style, f'-{style}')
+
+        # TODO: special-case font name normalization; extend as more cases appear.
+        if normalized ==  "TimesNewRoman" :
+            normalized = normalized.replace("TimesNewRoman","Times-Roman")
+        return normalized
+
+    def _process_ttc_font(self, ttc_font):
+        """Collect usable font names from a TTC font's 'name' table."""
+        def judge_name(name):
+            # Filter out URL-like or overly long records that are not real names.
+            if 'http://' in name or 'https://' in name or len(name) > 50:
+                return False
+            else:
+                return True
+        font_names = set()
+        try:
+            # All available name records.
+            name_records = ttc_font['name'].names
+
+            for idx, record in enumerate(name_records):
+                try:
+                    # Prefer the Simplified Chinese name (Windows langID 2052).
+                    if record.platformID == 3 and record.langID == 2052:
+                        cn_name = record.toUnicode()
+                        if judge_name(cn_name):
+                            font_names.add(cn_name)
+
+
+
+                    # Fall back to the English name (langID 1033).
+                    elif record.platformID == 3 and record.langID == 1033:
+                        name = record.toUnicode()
+                        if judge_name(name):
+                            font_names.add(name)
+                except:
+                    continue
+        except KeyError:
+            # No 'name' table: skip.
+            pass
+        return font_names
+    def get_installed_fonts(self, ):
+        """Return names and families of all installed fonts, with '宋体' (SimSun) promoted to the front if present."""
+        font_dirs = self.get_system_font_dirs()
+        installed_fonts = set()
+        for font_dir in font_dirs:
+            if not os.path.isdir(font_dir):
+                continue
+            for root, _, files in os.walk(font_dir):
+                for file in files:
+                    if file.lower().endswith(('.ttf', '.otf','.ttc')):
+                        font_path = os.path.join(root, file)
+
+                        try:
+                            if file.lower().endswith('.ttc'):
+                                # TTC collections: read the first font's name table.
+                                ttc_font = ttLib_TTFont(font_path, fontNumber=0)  # read the first font
+                                installed_fonts.update(self._process_ttc_font(ttc_font))
+                            else:
+                                with ttLib_TTFont(font_path) as font:
+                                    # Extract the full name and family name,
+                                    # in both Simplified Chinese (2052) and English (1033).
+                                    name_cn = font['name'].getName(4, 3, 1, 2052)
+                                    if name_cn:
+                                        installed_fonts.add(name_cn.toUnicode())
+                                    # 4=Full Name, 3=Windows, 1=Unicode
+                                    name = font['name'].getName(4, 3, 1, 1033)
+                                    if name:
+                                        installed_fonts.add(name.toUnicode())
+                                    family_cn = font['name'].getName(1, 3, 1, 2052)
+                                    if family_cn:
+                                        installed_fonts.add(family_cn.toUnicode())
+                                    family = font['name'].getName(1, 3, 1, 1033)
+                                    if family:  # 1=Family Name
+                                        installed_fonts.add(family.toUnicode())
+                        except Exception as e:
+                            print(f"解析字体 {font_path} 失败: {e}")
+        installed_fonts = list(installed_fonts)
+        # Move SimSun to the front so it is preferred as a fallback.
+        if "宋体" in installed_fonts:
+            installed_fonts.remove("宋体")
+            installed_fonts.insert(0, "宋体")
+        return installed_fonts
+
+    def is_font_available(self, target_font):
+        """Check whether target_font is installed (rescans the font directories each call)."""
+        installed_fonts = self.get_installed_fonts()
+        return target_font in installed_fonts
+
+    
+    def font_check(self):
+        # Placeholder: reportlab registration check is currently disabled.
+        pass
+        # logger.info("f{_tt2ps_map}")
+        # logger.info("f{_family_alias}")
+        
+        # for font in self.FONTS:
+        #     if font in _tt2ps_map.values():
+        #         logger.info(f"registered {font}")
+        #     else:
+        #         logger.warning(f"-{font}- not registered; writing may fail")
+                 
+        
+        
+    def register_font(self,file_name,FontName,font_b64):
+        """
+        Register a base64-encoded embedded font with reportlab under FontName.
+
+        The font bytes are written to a temp file in the current directory,
+        registered via pdfmetrics, then the file is removed. Registration
+        errors are logged, never raised.
+        """
+        if font_b64:
+            
+            file_name = os.path.split(file_name)
+            # logger.error(f"file_name:{file_name}")
+            # logger.info(f"file_name:{file_name}")
+            if isinstance(file_name, (tuple, list)):
+                    file_name = file_name[1]
+            if not FontName:
+                # Fall back to the file's base name without extension.
+                FontName = file_name.split(".")[0]
+
+            try:
+                with open(file_name, "wb") as f:
+                    f.write(base64.b64decode(font_b64))
+                # print("FontName", FontName, "file_name", file_name)
+                pdfmetrics.registerFont(TTFont(FontName, file_name))
+                self.FONTS.append(FontName)
+            except Exception as e:
+                logger.error(f"register_font_error:\n{e} \n 包含不支持解析字体格式")
+            finally:
+                # Always clean up the temp font file.
+                if os.path.exists(file_name):
+                    os.remove(file_name)

+ 666 - 0
format_convert/easyofd/easyofd/draw/ofdtemplate.py

@@ -0,0 +1,666 @@
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+#PROJECT_NAME: F:\code\easyofd\easyofd\draw
+#CREATE_TIME: 2023-10-30 
+#E_MAIL: renoyuan@foxmail.com
+#AUTHOR: reno 
+#note:  ofd 基础结构模板
+import tempfile
+import os
+import abc
+import copy
+
+from loguru import logger
+import xmltodict
+import zipfile
+
+__all__ = ["CurId", "OFDTemplate", "DocumentTemplate", "DocumentResTemplate",
+           "PublicResTemplate", "ContentTemplate", "OFDStructure"]
+"""
+OFD目录结构
+    │  OFD.xml
+    │  
+    └─Doc_0
+        │  Document.xml
+        │  DocumentRes.xml
+        │  PublicRes.xml
+        │  
+        ├─Annots
+        │  │  Annotations.xml
+        │  │  
+        │  └─Page_0
+        │          Annotation.xml
+        │          
+        ├─Attachs
+        │      Attachments.xml
+        │      original_invoice.xml
+        │      
+        ├─Pages
+        │  └─Page_0
+        │          Content.xml
+        │          
+        ├─Res
+        │      image_80.jb2
+        │      
+        ├─Signs
+        │  │  Signatures.xml
+        │  │  
+        │  └─Sign_0
+        │          Signature.xml
+        │          SignedValue.dat
+        │          
+        ├─Tags
+        │      CustomTag.xml
+        │      CustomTags.xml
+        │      
+        └─Tpls
+            └─Tpl_0
+                    Content.xml
+"""
class CurId(object):
    """Document-wide @ID allocator shared by all templates of one OFD doc."""

    def __init__(self):
        self.id = 1
        self.used = False
        # res_uuid -> generated @ID; filled while building resource files so
        # page construction can resolve ResourceID references afterwards.
        self.uuid_map = {}

    def add_uuid_map(self, k, v):
        """Record the @ID that was generated for resource uuid *k*."""
        self.uuid_map[k] = v

    def add(self):
        """Advance the counter by one."""
        self.id += 1

    def get_id(self):
        """Hand out the next free ID; the very first call returns 1."""
        if not self.used:
            self.used = True
            return self.id
        self.add()
        return self.id

    def get_max_id(self):
        """Return MaxUnitID: one past the highest ID handed out so far."""
        return self.id + 1
+
class TemplateBase(object):
    """Common machinery shared by the OFD xml templates.

    Subclasses supply:
      * ``ofdjson``       - the template dict (xmltodict shape)
      * ``key_map``       - constructor kwarg name -> xml tag to fill
      * ``id_keys``       - tags whose nodes receive a generated ``@ID``
      * ``template_name`` - label used in progress output
    """
    key_map = {}
    id_keys = []
    template_name = ""

    def __init__(self, *args, **kwargs):
        # CurId instance shared across the whole document
        self.id_obj: CurId = kwargs.get("id_obj")
        self.assemble(*args, **kwargs)

    def assemble(self, *args, **kwargs):
        """Deep-copy the template, fill caller values, then stamp IDs."""
        self.final_json = copy.deepcopy(self.ofdjson)

        # pour the caller-supplied values into the mapped xml tags
        for name, value in kwargs.items():
            if name in self.key_map:
                self.modify(self.final_json, self.key_map[name], value)

        # stamp @ID attributes on every node listed in id_keys
        for id_key in self.id_keys:
            print(f"开始gen_id >> {self.template_name}>>{id_key}")
            self.gen_id(self.final_json, id_key)

    def gen_id(self, ofdjson, id_key):
        """Recursively assign ``@ID`` to every node stored under *id_key*."""
        for node_key, node_val in ofdjson.items():
            if node_key == id_key:
                if isinstance(node_val, dict):
                    targets = [node_val]
                elif isinstance(node_val, list):
                    targets = node_val
                else:
                    targets = []
                for target in targets:
                    target["@ID"] = f"{self.id_obj.get_id()}"
            elif isinstance(node_val, dict):
                self.gen_id(node_val, id_key)
            elif isinstance(node_val, list):
                for entry in node_val:
                    if isinstance(entry, dict):
                        self.gen_id(entry, id_key)

    def modify(self, ofdjson, key, value):
        """Set every occurrence of *key* anywhere in the tree to *value*."""
        for node_key, node_val in ofdjson.items():
            if node_key == key:
                ofdjson[node_key] = value
            elif isinstance(node_val, dict):
                self.modify(node_val, key, value)
            elif isinstance(node_val, list):
                for entry in node_val:
                    if isinstance(entry, dict):
                        self.modify(entry, key, value)

    def save(self, path):
        """Serialize ``final_json`` to pretty-printed XML at *path*."""
        with open(path, "w", encoding="utf-8") as fh:
            fh.write(xmltodict.unparse(self.final_json, pretty=True))
+
class OFDTemplate(TemplateBase):
    """Root node, globally unique: OFD.xml.

    Carries document metadata (creator, date, DocID) and points the
    reader at Doc_0/Document.xml.
    """
    template_name = "OFD"
    # constructor kwarg -> xml tag receiving the value
    key_map = {"Author": "ofd:Author", "DocID": "ofd:DocID"  ,"CreationDate": "ofd:CreationDate"
    }

    ofdjson = {

        "ofd:OFD": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@Version": "1.1",
            "@DocType": "OFD",
            "ofd:DocBody": [{
                "ofd:DocInfo": {
                    "ofd:DocID": "0C1D4F7159954EEEDE517F7285E84DC4",
                    "ofd:Creator": "easyofd",
                    "ofd:author": "renoyuan",
                    "ofd:authoremail": "renoyuan@foxmail.com",
                    "ofd:CreatorVersion": "1.0",
                    "ofd:CreationDate": "2023-10-27"
                },
                "ofd:DocRoot": "Doc_0/Document.xml"
            }]
        }
    }
+
class DocumentTemplate(TemplateBase):
    """Unique per Doc: the document's internal structure (Document.xml).

    Lists the pages and shared resource files and records MaxUnitID.
    """
    template_name = "Document"
    key_map = {"Page": "ofd:Page","PhysicalBox":"ofd:PhysicalBox"}
    id_keys = ["ofd:Page"]
    ofdjson ={
    "ofd:Document": {
        "@xmlns:ofd": "http://blog.yuanhaiying.cn",
        "ofd:CommonData": {
            "ofd:MaxUnitID": 0,
            "ofd:PageArea": {
                "ofd:PhysicalBox": "0 0 140 90"
            },
            "ofd:PublicRes": "PublicRes.xml",
            "ofd:DocumentRes": "DocumentRes.xml"
        },
        "ofd:Pages":
            {
            "ofd:Page": [{
                "@ID": 0,
                "@BaseLoc": "Pages/Page_0/Content.xml"
            }]
        }
    }
}

    def update_max_unit_id(self, final_json=None):
        # Recursively locate ofd:MaxUnitID and set it to one past the
        # allocator's last ID; must run after all IDs were handed out.
        if not final_json:
            final_json = self.final_json

        for k, v in final_json.items():
            if k == "ofd:MaxUnitID":
                final_json["ofd:MaxUnitID"]=self.id_obj.get_max_id()
                return

            elif isinstance(v, dict):
                self.update_max_unit_id(v)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.update_max_unit_id(v_cell)

    def update_page(self,page_num):
        # Placeholder: updating the page list is not implemented yet.
        pass
+
class DocumentResTemplate(TemplateBase):
    """Unique per Doc: MultiMedia resources such as images (DocumentRes.xml)."""
    template_name = "DocumentRes"
    key_map = {"MultiMedia": "ofd:MultiMedia"}
    id_keys = ["ofd:DrawParam", "ofd:MultiMedia"]
    ofdjson = {
        "ofd:Res": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@BaseLoc": "Res",
            "ofd:MultiMedias": {
                "ofd:MultiMedia": [
                    {
                        "@ID": 0,
                        "@Type": "Image",
                        "ofd:MediaFile": "Image_2.jpg"
                    }
                ]
            }
        }
    }
    def gen_id(self,ofdjson, id_key):
        """Assign @IDs like the base class, additionally recording each
        node's ``res_uuid`` -> @ID pair in ``id_obj`` so pages can resolve
        their ResourceID references later.

        NOTE(review): duplicated verbatim in PublicResTemplate — candidate
        for hoisting into TemplateBase.
        """
        for k, v in ofdjson.items():
            if k == id_key:
                # stamp the id, remembering the res_uuid mapping when present
                if isinstance(ofdjson[k], dict):
                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"

                    res_uuid = ofdjson[k].get("res_uuid")
                    if res_uuid:
                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
                elif isinstance(ofdjson[k], list):
                    for i in ofdjson[k]:

                        i["@ID"] = f"{self.id_obj.get_id()}"
                        res_uuid = i.get("res_uuid")
                        if res_uuid:
                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])

            elif isinstance(v, dict):
                self.gen_id(v, id_key)


            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.gen_id(v_cell, id_key)
+
class PublicResTemplate(TemplateBase):
    """Unique per Doc: shared config resources such as Font and ColorSpace (PublicRes.xml)."""
    template_name = "PulicRes"  # (sic) typo kept; only used in progress output
    key_map = {"Font": "ofd:Font"}
    id_keys = ["ofd:ColorSpace", "ofd:Font"]
    ofdjson = {
        "ofd:Res": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@BaseLoc": "Res",
            "ofd:ColorSpaces": {
                "ofd:ColorSpace": {
                    "@ID": 0,
                    "@Type": "RGB",
                    "@BitsPerComponent": "8",
                    "#text":""
                }
            },
            "ofd:Fonts": {
                "ofd:Font": [
                {
                    "@ID": 0,
                    "@FontName": "宋体",
                    "@FamilyName": "宋体",

                }
            ]
            }
        }
    }
    def gen_id(self,ofdjson, id_key):
        """Assign @IDs like the base class, additionally recording each
        node's ``res_uuid`` -> @ID pair in ``id_obj``.

        NOTE(review): duplicated verbatim in DocumentResTemplate.
        """
        for k, v in ofdjson.items():
            if k == id_key:
                # stamp the id, remembering the res_uuid mapping when present
                if isinstance(ofdjson[k], dict):
                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
                    res_uuid = ofdjson[k].get("res_uuid")
                    if res_uuid:
                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
                elif isinstance(ofdjson[k], list):
                    for i in ofdjson[k]:

                        i["@ID"] = f"{self.id_obj.get_id()}"
                        res_uuid = i.get("res_uuid")
                        if res_uuid:
                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])

            elif isinstance(v, dict):
                self.gen_id(v, id_key)


            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.gen_id(v_cell, id_key)
+
+'''
+    "ofd:Font": [
+
+    {
+        "@ID": 0,
+        "@FontName": "STSong",
+        "@FamilyName": "SimSun",
+        "@Serif": "true",
+        "@FixedWidth": "true",
+        "@Charset": "prc"
+    }
+            "ofd:Area": {
+            "ofd:PhysicalBox": "0 0 210 140"
+        },
+'''
+
+
class ContentTemplate(TemplateBase):
    """Page body template: Content.xml.

    After base-class assembly, __init__ resolves the temporary
    ``res_uuid`` markers into the real @ID values recorded in
    ``id_obj.uuid_map`` so that TextObject/@Font and
    ImageObject/@ResourceID point at the right resources.
    """
    #"@Type": "Body",
    template_name = "Content"
    key_map = {"ImageObject": "ofd:ImageObject",
               "PathObject": "ofd:PathObject",
               "TextObject": "ofd:TextObject",
               "CGTransform": "ofd:CGTransform",
               "PhysicalBox": "ofd:PhysicalBox",
               }
    id_keys = ["ofd:Layer", "ofd:TextObject", "ofd:PathObject", "ofd:Clips", "ofd:ImageObject"]
    # object tag -> attribute that must carry a resolved resource @ID
    correlate_map = {"ofd:TextObject": "@Font",
                     "ofd:ImageObject": "@ResourceID"

                     }

    ofdjson = {
    "ofd:Page": {
        "@xmlns:ofd": "http://blog.yuanhaiying.cn",

        "ofd:Content": {
            "ofd:PageArea": {
                "ofd:PhysicalBox": "0 0 210 140"
            },
            "ofd:Layer":  {
                "@ID": 0,
                "@Type": "Foreground",


                "ofd:TextObject": [{
                        "@ID": 0,
                        "@CTM": "7.054 0 0 7.054 0 134.026",
                        "@Boundary": "69 7 72 7.6749",
                        "@Font": "69",
                        "@Size": "6.7028",
                        "ofd:FillColor": {
                            "@ColorSpace": "4",
                            "@Value": "156 82 35"
                        },
                        "ofd:CGTransform": {
                            "@CodePosition": "0",
                            "@CodeCount": "10",
                            "@GlyphCount": "10",
                            "ofd:Glyphs": "18 10 11 42 60 53 24 11 42 61"
                        },
                        "ofd:TextCode": {
                            "@X": "13.925",
                            "@Y": "10",
                            "@DeltaX": "7 7 7 7 7 7 7 7 7",
                            "#text": "电⼦发票(普通发票)"
                        }
                    }],
                "ofd:ImageObject": []
                }
        }}}

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # rewrite res_uuid markers into the resolved resource ids
        for key, targe_key in self.correlate_map.items():
            self.correlate_res_uuid(self.final_json, key, targe_key)

    def correlate_res_uuid(self, ofdjson, key, targe_key):
        """Replace ``res_uuid`` markers under *key* with the mapped @ID.

        BUG FIX: the original dict branch read ``v_cell`` before it was
        assigned (NameError, or a stale value leaked from an earlier loop
        iteration); it must pop the marker from ``v`` itself.
        """
        print("========uuid_map", self.id_obj.uuid_map)
        for k, v in ofdjson.items():
            if k == key:
                if isinstance(v, dict):
                    res_uuid = v.pop("res_uuid", None)
                    if res_uuid:
                        v[targe_key] = self.id_obj.uuid_map[res_uuid]
                elif isinstance(v, list):
                    for v_cell in v:
                        if isinstance(v_cell, dict):
                            res_uuid = v_cell.pop("res_uuid", None)
                            if res_uuid:
                                v_cell[targe_key] = self.id_obj.uuid_map[res_uuid]
            elif isinstance(v, dict):
                self.correlate_res_uuid(v, key, targe_key)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.correlate_res_uuid(v_cell, key, targe_key)
+
+
+'''
+                "ofd:PathObject": [{
+                        "@ID": 0,
+                        "@CTM": "0.3527 0 0 -0.3527 0.35 141.43001",
+                        "@Boundary": "-0.35 -0.35 212.33 141.78999",
+                        "@LineWidth": "1",
+                        "@MiterLimit": "10",
+                        "@Stroke": "false",
+                        "@Fill": "true",
+                        "ofd:FillColor": {
+                            "@ColorSpace": "4",
+                            "@Value": "255 255 255"
+                        },
+                        "ofd:StrokeColor": {
+                            "@ColorSpace": "4",
+                            "@Value": "0 0 0"
+                        },
+                        "ofd:Clips": {
+                            "ofd:Clip": {
+                                "ofd:Area": {
+                                    "ofd:Path": {
+                                        "@ID": 0,
+                                        "@Boundary": "0.00766 -0.00763 600 400.00003",
+                                        "@Stroke": "false",
+                                        "@Fill": "true",
+                                        "ofd:AbbreviatedData": "M 0 0 L 600 0 L 600 400.00003 L 0 400.00003 C"
+                                    }
+                                }
+                            }
+                        },
+                        "ofd:AbbreviatedData": "M -1 401 L 601 401 L 601 -1 L -1 -1 C"
+                    },],
+                
+"ofd:ImageObject": [{
+                        "@ID": 0,
+                        "@CTM": "19.7512 0 0 19.7512 0 0",
+                        "@Boundary": "7.23035 7.40671 19.7512 19.7512",
+                        "@ResourceID": "104"
+                    }],
+'''
+
+class OFDStructure(object):
+    """OFD structure"""
+    def __init__(self, name, ofd=None, document=None,
+                 document_res=None, public_res=None,
+                  content_res:list=[], res_static: dict={}):
+        # 初始化的时候会先自动初始化 默认参数值
+        id_obj = CurId()
+        self.name = name
+        self.ofd = ofd if ofd else OFDTemplate(id_obj=id_obj)
+        self.document = document if document else DocumentTemplate(id_obj=id_obj)
+        self.document_res = document_res if document_res else  DocumentResTemplate(id_obj=id_obj)
+        self.public_res = public_res if public_res else PublicResTemplate(id_obj=id_obj)
+        self.content_res = content_res if content_res else [ContentTemplate(id_obj=id_obj)]
+        self.res_static = res_static
+       
+    def __call__(self, test=False):
+        """写入文件生成ofd"""
+        with tempfile.TemporaryDirectory() as t_dir:
+            if test:
+                temp_dir = r"./test"
+                os.mkdir(temp_dir)
+            else:
+                temp_dir = t_dir
+            # 创建过程目录
+            temp_dir_doc_0 = os.path.join(temp_dir, 'Doc_0')
+            temp_dir_pages = os.path.join(temp_dir, 'Doc_0', "Pages")
+            temp_dir_res = os.path.join(temp_dir, 'Doc_0', "Res")  # 静态资源路径
+            for i in [temp_dir_doc_0, temp_dir_pages, temp_dir_res]:
+                # print(i)
+                os.mkdir(i)
+
+            # 写入 OFD
+            self.ofd.save(os.path.join(temp_dir, 'OFD.xml'))
+
+            # 更新 max_unit_id & 写入 Document
+            self.document.update_max_unit_id()
+            self.document.save(os.path.join(temp_dir_doc_0, 'Document.xml'))
+
+            # 写入 DocumentRes
+            self.document_res.save(os.path.join(temp_dir_doc_0, 'DocumentRes.xml'))
+
+            # 写入 PublicRes
+            self.public_res.save(os.path.join(temp_dir_doc_0, 'PublicRes.xml'))
+
+            # 写入 content_res
+            for idx, page in enumerate(self.content_res):
+                temp_dir_pages_idx = os.path.join(temp_dir_pages, f"Page_{idx}")
+                os.mkdir(temp_dir_pages_idx)
+                # os.mkdir(i)
+                page.save(os.path.join(temp_dir_pages_idx, 'Content.xml'))
+
+            # 写入静态资源
+            for k, v in self.res_static.items():
+                  with open(os.path.join(temp_dir_res, k), "wb") as f:
+                      f.write(v)
+
+            # 打包成ofd
+            zip = zipfile.ZipFile("test.ofd", "w", zipfile.ZIP_DEFLATED)
+            for path, dirnames, filenames in os.walk(temp_dir):
+                # 去掉目标跟路径,只对目标文件夹下边的文件及文件夹进行压缩
+                fpath = path.replace(temp_dir, '')
+
+                for filename in filenames:
+                    zip.write(os.path.join(path, filename), os.path.join(fpath, filename))
+            zip.close()
+            with open("test.ofd", "rb") as f:
+                content = f.read()
+            if os.path.exists("test.ofd"):
+               os.remove("test.ofd")
+            return content
+
if  __name__ == "__main__":
    # Demo: build a minimal one-page OFD containing a text object.
    print("---------")
    # static resource payload (image bytes); left empty for the demo
    img_path = r"F:\code\easyofd\test\test_img0.jpg"
    # with open(img_path, "rb") as f:
    #     content = f.read()
    content = b""
    res_static = {"Image_0.jpg": content}

    # build the element data passed into the templates
    font = [
            {

                "@FontName": "宋体",
                "@FamilyName": "宋体",

            }
            ]

    MultiMedia = [
                {

                    "@Type": "Image",
                    "ofd:MediaFile": "Image_0.jpg"
                }
            ]

    ImageObject = [{

                        "@CTM": "200 0 0 140 0 0",
                        "@Boundary": "0 0 200 140",
                        "@ResourceID": "55"
                    }]
    TextObject = [
        {


        "@Boundary": "50 5 100 20",
        "@Font": "2",
        "@Size": "5",
        "ofd:FillColor": {

            "@Value": "156 82 35",
            "@ColorSpace" : "1"
        },

        "ofd:TextCode": {
            "@X": "5",
            "@Y": "5",
            "@DeltaX": "7 7 7 7 7 7 7 7 7",
            "#text": "电⼦发票(普通发票)"
        }
    }, {


        "@Boundary": "0 0 100 100",
        "@Font": "2",
        "@Size": "10",
        "ofd:FillColor": {

            "@Value": "156 82 35"
        },

        "ofd:TextCode": {
            "@X": "0",
            "@Y": "0",
            "@DeltaX": "0",
            "#text": "电"
        }
    }
    ]

    # instantiate the templates, all sharing one id allocator
    id_obj = CurId()
    print("id_obj实例化", id_obj)

    ofd = OFDTemplate(id_obj=id_obj)
    document = DocumentTemplate(id_obj=id_obj)
    public_res = PublicResTemplate(Font=font, id_obj=id_obj)
    document_res = DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
    # ImageObject=ImageObject
    content_res = ContentTemplate(CGTransform=[], PathObject=[], TextObject=TextObject, ImageObject=[], id_obj=id_obj)



    ofd_byte = OFDStructure("123",ofd=ofd, document=document,public_res=public_res,
                            document_res=document_res, content_res=[content_res], res_static=res_static)(test=True)

    with open("test.ofd", "wb") as f:
        content = f.write(ofd_byte)

+ 966 - 0
format_convert/easyofd/easyofd/draw/pdf_parse.py

@@ -0,0 +1,966 @@
+import os
+import re
+import io
+
+import json
+import time
+import copy
+import string
+import random
+from uuid import uuid1
+from decimal import Decimal
+from collections import OrderedDict
+
+# 第三方包
+import fitz
+from PIL import Image
+# import pdfplumber
+
# Canonical export list: Python's star-import honours __all__, not the
# inert __ALL__ spelling the original defined.
__all__ = ['pdf_ocr', "DPFParser"]
__ALL__ = __all__  # legacy alias kept for backward compatibility
+
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also serializes bytes (via str) and Decimal (via float)."""

    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        if isinstance(obj, bytes):
            return str(obj)
        return super().default(obj)
+
class DPFParser(object):
    """Extract text spans, images and page sizes from a PDF via PyMuPDF (fitz)."""

    def __init__(self, ):
        # stateless; all methods take the pdf bytes explicitly
        pass

    def extract_text_with_details(self, pdf_bytes):
        """
        Extract every page's text with position and font information.

        :param pdf_bytes: raw PDF bytes
        :return: (details_list, res_uuid_map) where details_list is a list
            (one entry per page) of dicts describing text spans
            (type == "text") and images (type == "img"), and res_uuid_map
            registers fonts, image byte streams and page sizes under
            generated uuids.
        """
        details_list = []
        pdf_stream = io.BytesIO(pdf_bytes)

        # open the BytesIO object directly with fitz.open

        with fitz.open(stream=pdf_stream, filetype="pdf") as doc:
            res_uuid_map = {
                "img": {},
                "font": {},
                "other": {}
            } # global resource registry: uuid -> font name / image stream
            for page_num in range(len(doc)):


                page_details_list = []  # per-page collected info
                page = doc.load_page(page_num)
                rect = page.rect
                width = rect.width
                height = rect.height
                # record the page size under "other" -> "page_size"
                if res_uuid_map["other"].get("page_size"):
                    res_uuid_map["other"]["page_size"][page_num] = [width,height]
                else :
                    res_uuid_map["other"]["page_size"] = {page_num: [width, height]}
                blocks = page.get_text("dict").get("blocks")  # text block info
                image_list = page.get_images(full=True)  # all image records on the page
                # collect the page's text spans
                for block in blocks:
                    block_text = block.get("text", "")
                    block_rect = block["bbox"]  # block bounding box [x0, y0, x1, y1]

                    # walk every line of the block
                    for line in block.get("lines", []):
                        line_text = line.get("spans", [{}])[0].get("text", "")  # first span's text
                        line_rect = line["bbox"]  # line bounding box

                        # walk every span of the line to get font info
                        for span in line.get("spans", []):
                            span_text = span.get("text", "")
                            font_size = span.get("size")  # font size
                            font_name = span.get("font")  # font name
                            res_uuid = None
                            # reuse the uuid of an already-seen font name
                            if font_name not in res_uuid_map["font"].values():
                                res_uuid = str(uuid1())
                                res_uuid_map["font"][res_uuid] = font_name
                            else:
                                keys = list(res_uuid_map["font"].keys())
                                vs = list(res_uuid_map["font"].values())
                                idx = vs.index(font_name)
                                res_uuid =keys[idx]
                            font_color = span.get("color")  # font color (may be absent)
                            # NOTE(review): the whole line's bbox is used for
                            # each span; per-span boxes would need more work
                            span_rect = (
                            line_rect[0], line_rect[1], line_rect[2], line_rect[3])

                            # print or store the info
                            print(
                                f"Page: {page_num }, Text: '{span_text}', Font: {font_name}, Size: {font_size}, "
                                f"Color: {font_color}, Rect: {span_rect} ,res_uuid {res_uuid}")

                            # store in the result list (adjust the shape as needed)
                            page_details_list.append({
                                "page": page_num,
                                "text": span_text,
                                "font": font_name,
                                "res_uuid": res_uuid,
                                "size": font_size,
                                "color": font_color,
                                "bbox": list(span_rect),
                                "type": "text"
                            })

                for image_index, img_info in enumerate(image_list):
                    # resolve the image record
                    xref = img_info[0]
                    base_image = doc.extract_image(xref)

                    image_data = base_image["image"]  # raw image bytes
                    res_uuid = str(uuid1())

                    img_io = io.BytesIO(image_data)
                    res_uuid_map["img"][res_uuid] = img_io
                    image_type = base_image["ext"]  # image file extension
                    smask = base_image["smask"]  # soft-mask xref
                    xres = base_image["xres"]  # horizontal resolution
                    yres = base_image["yres"]  # vertical resolution
                    width = base_image["width"]  # image width
                    height = base_image["height"]  # image height



                    # NOTE(review): xres/yres are resolutions (dpi), not page
                    # coordinates — using them as the origin looks wrong;
                    # page.get_image_rects(xref) would give real placement.
                    x0, y0, x1, y1 = xres, yres,xres+width,yres+height
                    print(
                        f"Page: {page_num}, image_type: '{image_type}',x0{x0}, y0{y0}, x1{x1}, y1{y1}  ")
                    page_details_list.append({
                        "page": page_num,
                        "index": image_index,
                        "x0": x0,
                        "y0": y0,
                        "x1": x1,
                        "y1": y1,
                        "bbox": [x0,y0,width,height],
                        "width": width,
                        "height": height,
                        "res_uuid": res_uuid,
                        "image_type": image_type,
                        "type": "img"
                    })

                details_list.append(page_details_list)
        return details_list, res_uuid_map
    def to_img(self, buffer_pdf):
        """Render every page of the pdf bytes to a fitz Pixmap (pdf2img)."""
        pix_list = []
        pdfDoc = fitz.open(stream=buffer_pdf)
        for pg in range(pdfDoc.page_count):
            page = pdfDoc[pg]
            rotate = int(0)
            # Zoom factor per axis; without it the default render is
            # 792x612 at dpi=96 (1.33333333 -> 1056x816, 2 -> 1584x1224).
            zoom_x = 1.33333333 #(1.33333333-->1056x816)   (2-->1584x1224)
            zoom_y = 1.33333333
            # zoom_x,zoom_y = (1,1)
            mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
            pix = page.get_pixmap(matrix=mat, alpha=False)


            pix_list.append(pix)
        return pix_list
           
            
            
    def get_size(self):
        # Placeholder: not implemented.
        pass
+    
def coast_time(func):
    '''
    Decorator: print how long each call to *func* takes (perf_counter seconds).

    :param func: the callable to time
    :return: a wrapper that forwards all args and returns func's result
    '''
    import functools  # local import; the module header does not import it

    # FIX: without functools.wraps the wrapper clobbered the wrapped
    # function's __name__/__doc__, breaking introspection and debugging.
    @functools.wraps(func)
    def fun(*agrs, **kwargs):
        t = time.perf_counter()
        result = func(*agrs, **kwargs)
        print(f'function {func.__name__} coast time: {time.perf_counter() - t:.8f} s')
        return result
    return fun
+
+
class BaseInit:
    '''
    Basic information needed to parse one pdf: file names, output folders,
    separators and thresholds shared by the parsing pipeline.
    '''

    def __init__(self, pdf_path, output_path):
        """
        :param pdf_path: path of the pdf to parse
        :param output_path: root folder for every generated artifact
        """
        self.file_path = pdf_path
        self.output_path = output_path
        # file name, e.g. "doc.pdf"
        self.file_name = os.path.basename(self.file_path)
        # extension including the dot, e.g. ".pdf" ("" when absent)
        self.fileType = os.path.splitext(self.file_path)[-1]
        # BUG FIX: the original used file_name[:-len(fileType)], which
        # evaluates to "" for extension-less files (slice [:-0]);
        # splitext handles that edge case correctly.
        self.file_no_suffix = os.path.splitext(self.file_name)[0]
        # 62-symbol alphabet (52 letters + digits 0-9) used by genShortId
        self.uuidChars = tuple(list(string.ascii_letters) + list(range(10)))
        # table placeholder / separator characters
        self.divide = ':'
        self.solid = ''
        # iou ratio threshold
        self.iou_rate = 0.001
        # create the intermediate directories the pipeline needs
        self.init_file()

    def init_file(self):
        """
        Create the folders produced while parsing (page images + json results).
        """
        self.image_folder_path = os.path.join(self.output_path, 'pdf_img_save')
        self.json_folder_path = os.path.join(self.output_path, 'json')
        self.ocr_result_path = os.path.join(self.json_folder_path, self.file_no_suffix + '.json')
        # more (txt, ...) may follow; the current flow needs these
        for path in [self.image_folder_path, self.json_folder_path]:
            if not os.path.exists(path):
                os.makedirs(path)

    def genShortId(self, length=12):
        """
        Generate a short random id.

        :param length: total length of the generated id (default 12)
        """
        uuid = str(uuid1()).replace('-', '')
        result = ''
        for i in range(0, 8):
            sub = uuid[i * 4: i * 4 + 4]
            x = int(sub, 16)
            result += str(self.uuidChars[x % 0x3E])
        return result + ''.join(random.sample(uuid, length - 8))
+
+
class PageInfo(BaseInit):
    '''
    Registry of the images and tables discovered on each page.

    NOTE: __page_image/__page_table are class attributes, so the recorded
    state is shared by every instance in the process.
    '''
    __page_image = {}
    __page_table = {}

    @classmethod
    def add_image(cls, page_num, image):
        """Record *image* under *page_num*."""
        cls.__page_image.setdefault(page_num, []).append(image)

    @classmethod
    def add_table(cls, page_num, table):
        """Record *table* under *page_num*."""
        cls.__page_table.setdefault(page_num, []).append(table)

    @classmethod
    def get_image(cls, page_num):
        """Images recorded for *page_num* (empty list when none)."""
        return cls.__page_image.get(page_num, [])

    @classmethod
    def get_table(cls, page_num):
        """Tables recorded for *page_num* (empty list when none)."""
        return cls.__page_table.get(page_num, [])

    @classmethod
    def save_image(cls, output_path, file):
        '''
        Dump every recorded image to <output_path>/page_img_save/.

        :param output_path: destination root folder
        :param file: source file name; its stem prefixes each image name
        '''
        file = file.split('.')[0]
        for images in cls.__page_image.values():
            for image in images:
                payload = image['objContent']
                name = image['name']
                img_dir = os.path.join(output_path, 'page_img_save')
                img_path = os.path.join(img_dir, file + '_' + name + '.jpg')
                if not os.path.exists(img_dir):
                    os.mkdir(img_dir)
                with open(img_path, 'wb') as fp:
                    fp.write(payload)
+
+
+class ParseFile(PageInfo):
+
+    def __init__(self, pdf_path, output_path, table_type='v2', is_save=True):
+        """
+        :param pdf_path: path of the pdf file to parse
+        :param output_path: directory for json/image output (see BaseInit)
+        :param table_type: 'v2' merges table rows into the line list;
+            anything else keeps lines and tables separate
+        :param is_save: whether to dump the final result to disk
+        """
+        super().__init__(pdf_path, output_path)
+        print('初始化 pdf 对象:{}'.format(self.file_path))
+        self.is_save = is_save
+        self.table_type = table_type
+        # v1 result list: text lines and tables kept separate.
+        self.page_result_list = []
+        # v2 result list: table rows merged into the line list.
+        self.combine_page_result_list = []
+
+    @coast_time
+    def get_result(self):
+        """Load the pdf, parse every page and return the per-page results."""
+        self.load_pdf()
+        result = self.parse_pdf()
+        # Cached so callers can re-read it without reparsing.
+        self.ocr_result = result
+        print(f'解析完成:共 {len(result)} 页  表格类型: {self.table_type}')
+        return result
+
+    def load_pdf(self):
+        self.fitz_doc = fitz.open(self.file_path, filetype='pdf')
+        # self.pdfplum_doc_pages = pdfplumber.open(self.file_path).pages
+        # assert len(self.fitz_doc) == len(self.pdfplum_doc_pages)
+
+    def parse_pdf(self):
+        for page_no, fitz_doc in enumerate(self.fitz_doc):
+            # 测试
+            # if page_no != 25:
+            #     continue
+            self.height = fitz_doc.get_text('dict')['height']
+            self.width = fitz_doc.get_text('dict')['width']
+            # 聚合fitz页面解析的字符, 行, 块信息
+            line_list = self.group_block(page_no, fitz_doc)
+            # 获取页面表格信息
+            table_list = self.extract_table(page_no, self.pdfplum_doc_pages[page_no])
+            # 计算表格行列合并信息
+            table_list = list(CalcTableRL(table_list).run())
+            # 获取页面图片信息
+            image_list = self.get_image(page_no)
+            # 构造每页最终返回结果,
+            page_result = self.construct_final_result(line_list, page_no, image_list, table_list)
+
+            if self.table_type == 'v2':
+                # 合并成ocr所需格式:表格合并至行列表
+                combine_page_result_list = self.combine_table_v2(page_result)
+                page_result = self.construct_final_result(combine_page_result_list, page_no, image_list, table_list)
+
+            self.page_result_list.append(page_result)
+            if page_no and  page_no % 10 == 0:
+                print(f'解析前 {page_no} 页完成')
+        final_result_list = copy.deepcopy(self.page_result_list)
+        # 转换为符合ocr解析格式
+        if self.table_type == 'v2':
+            final_result_list = self.reform_ocr_result(final_result_list)
+        # 2023/09/26 保存之前加入 contIndex 给后续 抽取模型使用
+        for page_num, page in enumerate(final_result_list):
+            if not page.get('lineList'):
+                break
+            contIndex = {}
+            for line in page['lineList']:
+                line_bak = dict(copy.copy(line))
+                line_bak["objType_postpreprocess"] = f"{line_bak.get('objType','textLine')}_postpreprocess"
+                contIndex[line_bak["lineId"]] = line_bak
+            
+            page["contIndex"] = contIndex
+            for line in page['lineList']:
+                print(page_num, line['objType'], line['objContent'])
+        # 保存至本地
+        if self.is_save:
+            self.save_result(final_result_list)
+        for page_num, page in enumerate(final_result_list):
+            for line in page['lineList']:
+                print(page_num, line['objType'], line['objContent'])
+        return final_result_list
+
+    def combine_table_v2(self, page_result):
+        lineList = page_result['lineList']
+        table_list = page_result['table_list']
+        # 先进行表格行、非表格行划分 减少后续操作的时间杂度
+        __notable_lines, __all_table_lines = self.filter_table_line(lineList, table_list)
+        notable_lines, all_table_lines = copy.deepcopy(__notable_lines), copy.deepcopy(__all_table_lines)
+        del __notable_lines, __all_table_lines, lineList
+        # 整合
+        combine_page_result_list = self.combine_table_with_line(notable_lines, all_table_lines, table_list)
+        return combine_page_result_list
+
+    def filter_table_line(self, lineList, table_list):
+        '''
+        Split ``lineList`` into table and non-table lines.
+
+        A literal 'table' marker string is inserted into the non-table
+        list at each table's position so that combine_table_with_line can
+        splice the merged table rows back in later.
+
+        NOTE: ``lineList`` is consumed (popped) in page order; both it and
+        ``table_list`` are assumed sorted top-to-bottom.
+
+        :return: (__notable_lines, __all_table_lines) - the second item
+            holds one sub-list of matching lines per table
+        '''
+        __notable_lines = []
+        __all_table_lines = []
+        for table_info in table_list:
+            table_bbox = table_info['objPos']
+            # Lines belonging to the current table.
+            __sub_table_lines = []
+            is_iter_table = False
+            while lineList:
+                line = lineList.pop(0)
+                line_bbox = line['objPos']
+                # Guard against empty-table false positives: once the
+                # line's top Y passes the table's bottom Y, stop scanning.
+                table_y, line_y = table_bbox[3], line_bbox[1]
+                if line_y >= table_y:
+                    lineList.insert(0, line)
+                    break
+                iou = self.count_iou(table_bbox, line_bbox)
+                # Line overlaps the table region.
+                if iou > 0:
+                    __sub_table_lines.append(line)
+                    # First table line: drop a placeholder marker.
+                    if not is_iter_table:
+                        is_iter_table = True
+                        __notable_lines.append('table')
+                elif iou <= 0 and not is_iter_table:
+                    __notable_lines.append(line)
+                # Left the table region after having entered it.
+                elif iou <= 0 and is_iter_table:
+                    lineList.insert(0, line)
+                    line_index, flag = self.more_judge(table_bbox, lineList)
+                    if flag:
+                        # A later line still belongs to this table (e.g.
+                        # multi-column page or split table): skip ahead
+                        # and keep scanning from there.
+                        __notable_lines.extend(lineList[:line_index])
+                        lineList = lineList[line_index:]
+                    else:
+                        break
+            __all_table_lines.append(__sub_table_lines)
+        # Whatever remains after the last table is plain text.
+        if lineList:
+            __notable_lines.extend(lineList)
+        return __notable_lines, __all_table_lines
+
+    def more_judge(self, table_bbox, lineList, max_judge=6):
+        '''
+        判断后续行列表是否还存在属于当前表格的行
+        对于表格、行界限不明显的额外判断 如: 页面分栏、表格不全
+        :return 是否存在 True | False
+        '''
+        # 往后多判断 max_judge 行
+        if len(lineList) < max_judge:
+            judge_lines = lineList
+        else:
+            judge_lines = lineList[:max_judge]
+        for index, line in enumerate(judge_lines):
+            line_bbox = line['objPos']
+            iou = self.count_iou(table_bbox, line_bbox)
+            if iou > 0:
+                return index, True
+        return index, False
+
+
+    def combine_table_with_line(self, notable_lines, all_table_lines, table_list):
+        '''
+        Splice merged table rows back into the line list.
+
+        For each table, its rows replace the 'table' placeholder that
+        filter_table_line inserted into ``notable_lines``; the first text
+        line matching a table row donates its metadata while its content,
+        position and type are overwritten with the table row's.
+
+        :param notable_lines: non-table lines with 'table' placeholders
+        :param all_table_lines: per-table lists of matching text lines
+        :param table_list: table info dicts (their cells are mutated)
+        :return: the fully merged line list
+        '''
+        for table_id, table in enumerate(table_list):
+            new_table_lines = []
+            for table_line in table['lineList']:
+                is_iter_table = False
+                table_line_bbox = table_line['objPos']
+                # Match every candidate text line against this table row.
+                for __line in all_table_lines[table_id]:
+                    line = copy.deepcopy(__line)
+                    line_bbox = line['objPos']
+                    iou = self.count_iou(table_line_bbox, line_bbox)
+                    # First match: keep the text line's other fields but
+                    # overwrite content/position/type with the table row's.
+                    if iou > self.iou_rate and not is_iter_table:
+                        is_iter_table = True
+                        line['objContent'] = table_line['objContent']
+                        line['objPos'] = table_line['objPos']
+                        line['objType'] = 'table'
+                        line['tableId'] = table_id
+                        self.combine_cell_with_span(table_line, line)
+                        line['cells'] = table_line['cells']
+                        new_table_lines.append(line)
+                    elif iou > self.iou_rate and is_iter_table:
+                        self.combine_cell_with_span(table_line, line)
+                    else:
+                        pass
+            if 'table' not in notable_lines or not new_table_lines:
+                # FIX ERROR: 'table' is not in list
+                # Handles a small table detected inside a big one.
+                # Possible bug: nested big tables could desync the number
+                # of placeholders vs. the assigned row groups.
+                continue
+            # Replace the 'table' placeholder with the merged rows.
+            table_index = notable_lines.index('table')
+            new_notable_lines = notable_lines[:table_index]
+            new_notable_lines.extend(new_table_lines)
+            notable_lines = new_notable_lines + notable_lines[table_index+1:]
+        return notable_lines
+
+    def combine_cell_with_span(self,table_line , text_line):
+        '''
+        将表格的cell内加上对应span的chars信息:解决表格合并时cell有多行导致chars顺序错乱的问题
+        '''
+        del_list = []
+        for index, cell in enumerate(table_line['cells']):
+            if not cell.get('chars'):
+                cell['chars'] = []
+            cell_bbox = cell['objPos']
+            if cell_bbox is None:
+                del_list.append(index)
+                continue
+            for span in  text_line['span']:
+                span_bbox = span['bbox']
+                iou = self.count_iou(cell_bbox, span_bbox)
+                if iou < self.iou_rate:
+                    continue
+                # 为了解决一些 span 和 cell 长度不一致问题 将循环细分到每个字符chars
+                for char in span['chars']:
+                    char_bbox = char['bbox']
+                    iou = self.count_iou(cell_bbox, char_bbox)
+                    if iou > self.iou_rate:
+                        cell['chars'].append(char)
+                    else:
+                        pass
+        # 清除无效的span
+        if len(del_list):
+            for index, index_del in enumerate(del_list):
+                index_del -= index
+                del table_line['cells'][index_del]
+
+    def group_block(self, page_num, fitz_doc):
+        """
+        Merge fitz's 'dict' (span-level) and 'rawdict' (char-level) block
+        views so every span carries its individual character info.
+
+        Reference: https://pymupdf.readthedocs.io/en/latest/textpage.html#textpagedict
+
+        NOTE(review): the two block lists are zipped positionally after an
+        identical sort - this assumes both get_text() calls return the
+        same blocks in the same structure; confirm for unusual pdfs.
+
+        :param page_num: page number (used for ids / the image registry)
+        :param fitz_doc: fitz page object
+        :return: list of line info dicts (see construct_line_info)
+        """
+        line_count = 0
+        total_line_list = []
+        # char_blocks: finest granularity is the single character.
+        char_blocks = fitz_doc.get_text('rawdict')['blocks']
+        # block_blocks: finest granularity is the span within a line.
+        block_blocks = fitz_doc.get_text('dict')['blocks']
+        # Sort both views into reading order first.
+        char_blocks.sort(key=lambda x: [int(x['bbox'][1]), int(x['bbox'][0])])
+        block_blocks.sort(key=lambda x: [int(x['bbox'][1]), int(x['bbox'][0])])
+        # Pair the two views positionally.
+        group_blocks = zip(block_blocks, char_blocks)
+        for span_blocks, char_block in group_blocks:
+            if span_blocks['type'] == 1:
+                # Image block: register it instead of treating it as text.
+                img_attrs = self.deal_image(page_num, line_count, span_blocks)
+                self.add_image(page_num, img_attrs)
+                continue
+            for line_index, line in enumerate(span_blocks['lines']):
+                line['text'] = ''
+                line['chars'] = []
+                line['span'] = []
+                # Merge each line here (cheaper than a second pass) and
+                # attach per-character info to every span.
+                for span_index, span in enumerate(line['spans']):
+                    span['text'] = span['text'].replace(' ', '').strip()
+                    if not span['text']:
+                        continue
+                    # Copy the rawdict chars onto the dict span.
+                    span_chars = char_block['lines'][line_index]['spans'][span_index]['chars']
+                    span_chars = [char for char in span_chars if char['c'].strip()]
+                    line['text'] += span['text']
+                    line['chars'].extend(span_chars)
+                    line['span'].append({'bbox': span['bbox'], 'chars': span_chars,'text': span['text']})
+                if not line['text']:
+                    continue
+                # Build the per-line result structure.
+                line_info = self.construct_line_info(line['text'], line['bbox'], line['span'], line['chars'],
+                                                     line_count, page_num)
+                total_line_list.append(line_info)
+                line_count += 1
+        return total_line_list
+
+    def extract_table(self, page_no, plum_page):
+        '''
+        提取页面所有表格
+        :param page_no:
+        :param plum_page:
+        :return:
+        '''
+        table_list = []
+        for table in plum_page.find_tables():
+            # 获取当前表格的边界定位
+            table_line_list = self.merge_table_row(table)
+            if not table_line_list:
+                continue
+            table_info = self.deal_table(page_no, table.bbox, table_line_list)
+            table_list.append(table_info)
+            # 将表格信息加入全局变量 | 此处有点有点冗余
+            self.add_table(page_no, table_info)
+        return table_list
+
+    def merge_table_row(self, table):
+        '''
+        表格cell 按行合并
+        :param table:
+        :return: [({line_text}, {line_bbox}), ...]
+        '''
+        table_line_list = []
+        for item, row in zip(table.extract(), table.rows):
+            # 表格每行预处理
+            table_line = self.divide.join([self.clear_text(txt) for txt in item])
+            # 判断当前行是否为空
+            __line = self.clear_text(table_line).replace(' ', '')
+            if not __line:
+                continue
+            table_line_list.append((table_line, row.bbox, zip(item, row.cells)))
+        return table_line_list
+
+    def clear_text(self, txt, retrans=False):
+
+        if retrans:
+            txt = txt.replace(self.solid, '').replace(self.divide, '')
+        else:
+            # 空列替换为占位符
+            txt = txt if txt else self.solid
+        return str(txt).replace('\n', '').replace(' ', '')
+
+    def deal_table(self, page_no, table_bbox, table_line_list):
+        '''
+        对表格做结构转换
+        :param page_no:
+        :param table_bbox:
+        :param table_line_list:
+        :return:
+        '''
+        table_first_line = self.clear_text(table_line_list[0][0], retrans=True)
+        table_id = '{0}_{1}_'.format(page_no, table_first_line) + self.genShortId()
+        lineList = [{
+            'objContent': line[0],
+            'objPos': line[1],
+            'cells': self.deal_table_cell(line[2])
+        } for line in table_line_list]
+        table_info = {
+            'tableId': table_id,
+            'name': table_id,
+            'objPos': table_bbox,
+            'lineList': lineList,
+        }
+        return table_info
+
+    def deal_table_cell(self, cells):
+        return [{"objContent": self.clear_text(text), "objPos": box} for text, box in cells]
+
+    def deal_image(self, page_num, name, img_attrs):
+        '''
+        对image做结构转换
+        :param page_num:
+        :param name:
+        :param img_attrs:
+        :return:
+        '''
+        image_id = '{0}_{1}_'.format(page_num, name) + self.genShortId()
+        img_info = {
+            'imageId': image_id,
+            'name': image_id,  # 暂时以图片所在页面的行数命名
+            'objPos': img_attrs['bbox'],
+            'ext': img_attrs['ext'],
+            'objContent': img_attrs['image'],
+            'size': img_attrs['size']
+        }
+        return img_info
+
+    def deal_chars(self, line_num, lineId, chars):
+        '''
+        对chars做结构转换
+        :param line_num:
+        :param lineId:
+        :param chars:
+        :return:
+        '''
+        num_count = 0
+        char_list = []
+        for char in chars:
+            if not char['c'].strip():
+                continue
+            char_dict = {
+                'lineId': lineId,
+                'charId': 'char_' + str(line_num) + '_' + str(num_count) + '_' + self.genShortId(),
+                'objContent': char['c'],
+                'objPos': char['bbox']
+            }
+            char_list.append(char_dict)
+            num_count += 1
+        return char_list
+
+    def construct_line_info(self, text, rect, span, chars, count, pageNo, objType='textLine'):
+        '''
+        对每行做结构转换
+        # x, y, h, w = rect[0], rect[1], rect[3] - rect[1], rect[2] - rect[0]
+        '''
+        lineId = 'line_' + str(pageNo) + '_' + str(count) + '_' + self.genShortId()
+        chars = self.deal_chars(count, lineId, chars)
+        return OrderedDict({
+            'lineNo': count,
+            'lineId': lineId,
+            'objType': objType,
+            'objContent': re.sub(r'\s', '', text),
+            'chars': chars,
+            'objPos': rect,
+            'span': span
+        })
+
+    @staticmethod
+    def rect_format(bbox):
+        '''
+        数据坐标转换 x1, y1, x2, y2 >> y1, x1 h, w
+        :param rect: [x1, y1, x2, y2]
+        :return: [y, x, h, w]
+        '''
+        y, x, h, w = bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]
+        return [y, x, h, w]
+
+    def count_iou(self, RecA, RecB):
+        '''
+        计算边框交并比
+        左上边界坐标为Ax0, Ay0, Bx0, By0
+        右下边界坐标为Ax1, Ay1, Bx1, By1
+        交集面积计算为:
+            M = min(Ax1, Bx1) - max(Ax0, Bx0)
+            H = min(Ay1, By1) - max(Ay0, By0)
+        # 当前表格的边界信息
+        left_x, top_y, right_x, botm_y: table_box_info[0], table_box_info[1], table_box_info[2], table_box_info[3]
+        '''
+        M = min(RecB[2], RecA[2]) - max(RecB[0], RecA[0])
+        H = min(RecB[3], RecA[3]) - max(RecB[1], RecA[1])
+
+        # 计算交集部分面积
+        interArea = max(0, M) * max(0, H)
+
+        # 计算两个边框的面积
+        RecA_Area = (RecA[2] - RecA[0]) * (RecA[3] - RecA[1])
+        RecB_Area = (RecB[2] - RecB[0]) * (RecB[3] - RecB[1])
+        # 计算IOU
+        iou = interArea / float(RecA_Area + RecB_Area - interArea)
+        return iou
+
+    def construct_final_result(self, line_list, pageNo, image_list=[], table_list=[]):
+        '''
+        每页转换为最终数据结构
+        :param line_list: ocr每行结果
+        :param pageNo: 页码
+        :param image_list:
+        :param table_list:
+        :return: type: Dict
+        '''
+        document_id = 'v1' + '_' + self.file_no_suffix + '_' + self.genShortId()
+        return OrderedDict({
+            'pageNo': pageNo,
+            'docID': document_id,
+            'page_info':{'size': [self.width, self.height]},
+            'lineList': line_list,
+            'image_list': image_list if image_list else [],
+            'table_list': table_list if table_list else []
+        })
+
+    def save_result(self, final_result_list):
+        '''
+        保存结果数据至本地
+        '''
+        if self.table_type == 'v2':
+            with open(self.ocr_result_path, 'w', encoding='utf-8') as f:
+                json.dump(final_result_list, f, indent=4, ensure_ascii=False)
+        else:
+            with open(self.ocr_result_path, 'w', encoding='utf-8') as f:
+                json.dump(self.page_result_list, f, cls=MyEncoder, indent=4, ensure_ascii=False)
+
+    def reform_ocr_result(self, final_result_list):
+        """
+        Final post-processing of the v2 result: renumber lines, convert
+        coordinates to [y, x, h, w] and attach per-char offsets.
+
+        :param final_result_list: merged local/ocr parse result; mutated
+            in place and also returned
+        """
+        for result_list in final_result_list:
+            del result_list['image_list']
+            del result_list['table_list']
+            lineList = result_list['lineList']
+            for num, line in enumerate(lineList):
+                # Rewrite the line number and the counter inside lineId.
+                line['lineNo'] = str(num)
+                line_split = line['lineId'].split('_')
+                line_split[-2] = str(num)
+                line['lineId'] = '_'.join(line_split)
+                obj_type = line['objType']
+                # Per-char x/y offsets relative to the line's top-left.
+                offset_x_list, offset_y_list = self.coord_offset(line, obj_type)
+                # Convert to [y, x, h, w]; the x offsets are appended as
+                # a fifth element of objPos (consumed downstream).
+                line['objPos'] = self.rect_format(line['objPos'])
+                line['objPos'].append(offset_x_list)
+                line['chars_offset'] = [offset_x_list, offset_y_list]
+                if line.get('chars'):
+                    del line['chars']
+                if obj_type == 'table' and line.get('span'):
+                    del line['span']
+        return final_result_list
+
+    def coord_offset(self, line, obj_type='textLine'):
+        '''
+        Offsets of each char's top-left corner relative to the line's
+        top-left corner; also converts span/cell/char boxes in place.
+
+        :param line: line dict (objPos still in x1,y1,x2,y2 form here)
+        :param obj_type: 'textLine' reads line['span']; anything else
+            (i.e. 'table') reads line['cells']
+        :return: (offset_x_list, offset_y_list)
+        '''
+        offset_x_list = []
+        offset_y_list = []
+        line_x, line_y = line['objPos'][0], line['objPos'][1]
+        if obj_type == 'textLine':
+            for span in line['span']:
+                self.all_rect_format(span)
+                for char in span['chars']:
+                    char_x, char_y = char['bbox'][0], char['bbox'][1]
+                    offset_x_list.append(char_x - line_x)
+                    offset_y_list.append(char_y - line_y)
+                    self.all_rect_format(char)
+        else:
+            __cells = []
+            for num, _cell in enumerate(line['cells']):
+                # Deep copy so shared cell dicts are not converted twice.
+                cell = copy.deepcopy(_cell)
+                self.all_rect_format(cell)
+                for char in cell['chars']:
+                    char_x, char_y = char['bbox'][0], char['bbox'][1]
+                    offset_x_list.append(char_x - line_x)
+                    offset_y_list.append(char_y - line_y)
+                    self.all_rect_format(char)
+                __cells.append(cell)
+            line['cells'] = __cells
+        return offset_x_list, offset_y_list
+
+    def all_rect_format(self, obj):
+        '''
+        Normalise a span/cell/char dict to the ocr format in place:
+        text/c -> objContent, bbox/objPos -> [y, x, h, w].
+
+        Objects with a 'chars' key are spans or cells; everything else is
+        assumed to be a raw fitz char dict with 'c' and 'bbox'.
+        '''
+        if 'chars' in obj:
+            if obj.get('text'):
+                obj['objContent'] = obj['text']
+                del obj['text']
+            if obj.get('objPos'):
+                obj['objPos'] = self.rect_format(obj['objPos'])
+            elif obj.get('bbox'):
+                obj['objPos'] = self.rect_format(obj['bbox'])
+                del obj['bbox']
+        else:
+            obj['objContent'] = obj['c']
+            obj['objPos'] = self.rect_format(obj['bbox'])
+            del obj['c']
+            del obj['bbox']
+
+class CalcTableRL:
+    '''
+    Reconstruct a table's virtual grid and compute row/column span info.
+
+    Input: table structure(s) containing the bbox of every cell.
+    Output: the same structure(s), with row_start_end / col_start_end
+    added to each positioned cell.
+
+    NOTE(review): grid positions are found with exact list.index() over
+    the collected coordinates - assumes cells that share an edge carry
+    byte-identical float coordinates; confirm this holds for the table
+    source in use.
+    '''
+    def __init__(self, table_info):
+        # Either a single table dict or a list of them.
+        self.table_info = table_info
+
+    def run(self):
+        """Yield every table with span info added (generator)."""
+        if isinstance(self.table_info, list):
+            for table_info in self.table_info:
+                table_info = self.add_table_property(table_info)
+                yield table_info
+        else:
+            table_info = self.add_table_property(self.table_info)
+            yield table_info
+    def add_table_property(self, table_info):
+        '''
+        Add merge info to every cell:
+        cell['col_start_end'] = (col_start, col_end)
+        cell['row_start_end'] = (row_start, row_end)
+        '''
+        # All distinct x / y coordinates of cell edges.
+        set_x, set_y = self.collect_table_coord(table_info)
+        # Sorted, they describe the finest-grained virtual grid.
+        list_x, list_y = sorted(set_x), sorted(set_y)
+        for line in table_info['lineList']:
+            for cell in line['cells']:
+                if cell['objPos'] == None:
+                    continue
+                x1, y1, x2, y2 = cell['objPos']
+                # Locate the cell's corners within the virtual grid.
+                col_start = list_x.index(x1)
+                col_end = list_x.index(x2)
+                row_start = list_y.index(y1)
+                row_end = list_y.index(y2)
+                cell['col_start_end'] = (col_start, col_end)
+                cell['row_start_end'] = (row_start, row_end)
+        return table_info
+
+    def collect_table_coord(self, table_info):
+        '''
+        Collect the deduplicated x1/x2 and y1/y2 edge coordinates of every
+        positioned cell of one table.
+
+        :param table_info: single table info dict
+        :return: (set_x, set_y)
+        '''
+        set_x = set()
+        set_y = set()
+        for line in table_info['lineList']:
+            for cell in line['cells']:
+                if cell['objPos'] == None:
+                    continue
+                x1, y1, x2, y2 = cell['objPos']
+                set_x.add(x1)
+                set_x.add(x2)
+                set_y.add(y1)
+                set_y.add(y2)
+        return set_x, set_y
+
+
+
+def pdf_ocr(pdf_path, output_path, table_type='v2', is_save=True):
+    '''
+    简单封装, 方便调用和多线程
+    '''
+    pdf = ParseFile(pdf_path, output_path, table_type, is_save)
+    pdf.get_result()
+    return pdf
+
+# ---------------------------以下是测试案列-----------------------------------
+
+@coast_time
+def test_dir():
+    for root in os.walk(r'E:\workplace\cjhx_test\创金和信\pdf2json\input\all_test'):
+        dir, files = root[0], root[2]
+        for file in files:
+            if 'test.pdf' not in file:
+                continue
+            file_path = os.path.join(dir, file)
+            output_dir = r'E:\workplace\cjhx_test\创金和信\pdf2json\file_data\all_test'
+            pdf_ocr_result = pdf_ocr(file_path, output_dir)
+
+@coast_time
+def test_single():
+    """Parse a single hard-coded pdf (alternative inputs left commented)."""
+    # file_path = r'E:\workplace\daily_work\pdf2json\input\all_test\测试足够复杂的表格解析.pdf'
+    file_path = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/20220913-浙江省贰号职业年金计划银华资产组合2022年二季度管理费用支付指令.pdf'
+    output_dir = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/电子解析'
+    pdf = pdf_ocr(file_path, output_dir, table_type='v2')
+    # print(pdf.ocr_result)
+
+@coast_time
+def test_thread():
+    """Parse a directory of pdfs in parallel (process pool of 8)."""
+    from concurrent.futures import ProcessPoolExecutor
+    pool = ProcessPoolExecutor(max_workers=8)
+    # Thread-pool alternative:
+    # from concurrent.futures import ThreadPoolExecutor
+    # pool = ThreadPoolExecutor(max_workers=8)
+    for root in os.walk(r'E:\workplace\daily_work\pdf2json\input\签字模板二'):
+        dir, files = root[0], root[2]
+        for file in files:
+            file_path = os.path.join(dir, file)
+            output_dir = r'E:\workplace\daily_work\pdf2json\output\签字模板二'
+            ret = pool.submit(pdf_ocr, file_path, output_dir, table_type='v2')
+            ret.add_done_callback(print_callback)
+    pool.shutdown()
+
+def print_callback(ret):
+    # Done-callback for pool futures; result printing is disabled.
+    # print('ret:', ret.result())
+    pass
+
+if __name__ == '__main__':
+    # test_dir()
+    # test_thread()
+    # test_single()
+    # NOTE(review): DPFParser and Image are not defined/imported anywhere
+    # in this chunk - confirm the imports before running this harness.
+    pdf_obj = DPFParser()
+    with open(r"F:\code\easyofd\test\test.pdf","rb") as f:
+        pdf_bytes = f.read()
+
+    img_list = pdf_obj.to_img(pdf_bytes)
+    pil_img_list = []
+    for _img in img_list:
+        print(_img.width,_img.height)
+        img = Image.frombytes("RGB", [_img.width, _img.height], _img.samples)
+        print(type(img))
+        img.save('output_image.png')
+      
+    

BIN
format_convert/easyofd/easyofd/draw/simsun.ttc


+ 301 - 0
format_convert/easyofd/easyofd/ofd.py

@@ -0,0 +1,301 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: F:\code\easyofd\easyofd
+# CREATE_TIME: 2023-10-07
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# note:  ofd 基础类
+import base64
+import os
+import sys
+from io import BytesIO
+from typing import Union
+
+# sys.path.insert(0, os.getcwd())
+# sys.path.insert(0, "..")
+
+import fitz
+from PIL import Image
+from fontTools.ttLib import TTFont
+from loguru import logger
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../")
+
+from format_convert.easyofd.easyofd.parser_ofd import OFDParser
+from format_convert.easyofd.easyofd.draw import DrawPDF, OFDWrite
+
+
+class OFD(object):
+    """ofd对象"""
+
+    def __init__(self, ):
+        self.data = None
+
+    def read(self, ofd_f: Union[str, bytes, BytesIO], fmt="b64", save_xml=False, xml_name="testxml", save_dir=None):
+        """_summary_
+        Args:
+            file (_type_): _description_
+            fomat (str, optional): _description_. Defaults to "path".
+            fomat in ("path","b64","binary")
+        """
+        if fmt == "path":
+            with open(ofd_f, "rb") as f:
+                ofd_f = str(base64.b64encode(f.read()), encoding="utf-8")
+        elif fmt == "b64":
+            pass
+        elif fmt == "binary":
+            ofd_f = str(base64.b64encode(ofd_f), encoding="utf-8")
+        elif fmt == "io":
+            ofd_f = str(base64.b64encode(ofd_f.getvalue()), encoding="utf-8")
+        else:
+            raise "fomat Error: %s" % fmt
+
+        self.data = OFDParser(ofd_f)(save_xml=save_xml, xml_name=xml_name, save_dir=save_dir)
+
+    def save(self, ):
+        """
+        draw ofd xml
+        初始化一个xml 文件
+        self.data > file
+        """
+        assert self.data, f"data is None"
+
+    def pdf2ofd(self, pdfbyte, optional_text=False):
+        """pdf转ofd"""
+        assert pdfbyte, f"pdfbyte is None"
+        # logger.info(f"pdf2ofd")
+        ofd_byte = OFDWrite()(pdfbyte, optional_text=optional_text)
+        return ofd_byte
+
+    def to_pdf(self, return_need_convert_as_image=False):
+        """return ofdbytes"""
+
+        assert self.data, f"data is None"
+        # logger.info(f"to_pdf")
+        obj = DrawPDF(self.data)
+        result = obj()
+        if not return_need_convert_as_image:
+            return result
+        else:
+            return result, obj.page_need_to_image_dict
+
+    def pdf2img(self, pdfbytes):
+
+        image_list = []
+
+        doc = fitz.open(stream=pdfbytes, filetype="pdf")
+
+        for page in doc:
+            rotate = int(0)
+            zoom_x, zoom_y = 1.6, 1.6
+            zoom_x, zoom_y = 2, 2
+            mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
+            pix = page.get_pixmap(matrix=mat, alpha=False)
+            pil_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            # image = np.ndarray((pix.height, pix.width, 3), dtype=np.uint8, buffer=pix.samples)
+            # print(image.shape)
+            # print(image[2])
+            image_list.append(pil_image)
+        # logger.info(f"pdf2img")
+        return image_list
+
+    def jpg2ofd(self, imglist: list):
+        """
+        imglist: pil image list
+        """
+        ofd_byte = OFDWrite()(pil_img_list=imglist)
+        return ofd_byte
+
+    def jpg2pfd(self, imglist: list):
+        """
+        imglist: PIL image list
+        1 构建data 
+        2 DrawPDF(self.data)()
+        """
+
+        data = OFDParser(None).img2data(imglist)
+        return DrawPDF(data)()
+
+    def to_jpg(self, format="jpg"):
+        """
+        return pil list
+        """
+        assert self.data, f"data is None"
+        image_list = []
+        pdfbytes = self.to_pdf()
+        image_list = self.pdf2img(pdfbytes)
+        return image_list
+
+    def del_data(self, ):
+        """销毁self.data"""
+        self.data = None
+
+    def __del__(self):
+        del self
+
+    def disposal(self, ):
+        """销毁对象"""
+        self.__del__()
+
+
+def find_similar_characters():
+    similar_pairs = []
+    for code in range(0x4E00, 0x9FFF):  # 遍历常见的中文字符范围
+        char = chr(code)
+        try:
+            name = unicodedata.name(char)
+            if name.startswith('CJK COMPATIBILITY IDEOGRAPH'):
+                original_char = unicodedata.lookup(name.split()[-1])
+                similar_pairs.append((original_char, char))
+        except (ValueError, KeyError):
+            continue
+    return similar_pairs
+
+
def save_chinese_characters(output_path):
    """Write the CJK Compatibility Ideographs (U+F900..U+FAD9) to a file.

    One character per line, UTF-8 encoded. Used to build the lookup file
    that maps compatibility glyphs back to common characters.
    (Earlier experiments with the Extension-A / unified / Kangxi ranges were
    left here as commented-out code; removed - see git history if needed.)
    """
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(chr(code) + '\n' for code in range(0xF900, 0xFAD9 + 1))
+
+
def map_kangxi_to_common_characters(kangxi_start=0x2F00, kangxi_end=0x2FDF, common_start=0x4E00, common_end=0x9FFF, output_path="kangxi_to_common.txt"):
    """Write a Kangxi-radical -> common-CJK-character mapping file.

    The original implementation compared ``chr(kangxi_code) ==
    chr(common_code)`` across two disjoint code-point ranges - distinct code
    points never compare equal, so the O(n*m) double loop wrote nothing.
    Kangxi radicals (U+2F00..U+2FDF) carry a compatibility decomposition to
    their unified ideograph, so NFKC normalization yields the mapping in a
    single O(n) pass. Output format per line is unchanged:
    ``<kangxi> (Kangxi: 0x....) -> <common> (Common: 0x....)``.
    """
    import unicodedata

    with open(output_path, 'w', encoding='utf-8') as file:
        for kangxi_code in range(kangxi_start, kangxi_end + 1):
            kangxi_char = chr(kangxi_code)
            common_char = unicodedata.normalize('NFKC', kangxi_char)
            # skip unassigned/undecomposable points and multi-char expansions
            if len(common_char) != 1 or common_char == kangxi_char:
                continue
            common_code = ord(common_char)
            if not (common_start <= common_code <= common_end):
                continue
            file.write(f"{kangxi_char} (Kangxi: {hex(kangxi_code)}) -> {common_char} (Common: {hex(common_code)})\n")
+
+
if __name__ == "__main__":
    # Development scratch block. The TTF-table dumps (fontTools) and the
    # Unicode normalization / compatibility-character experiments that used
    # to live here as ~100 lines of commented-out code were removed; see git
    # history if they are needed again. The live statements below are
    # unchanged.
    import unicodedata  # noqa: F401 - kept available for ad-hoc checks

    output_path = 'chinese_characters.txt'
    # Regenerate the compatibility-ideograph list on demand:
    # save_chinese_characters(output_path)

+ 37 - 0
format_convert/easyofd/easyofd/parser_ofd/__init__.py

@@ -0,0 +1,37 @@
+import os
+import sys
+
+from loguru import logger
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.cidfonts import UnicodeCIDFont
+from reportlab.pdfbase.ttfonts import TTFont
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../../")
+
+
+# from ofd_parser import *
+
+
# Font file -> list of alias names seen in real OFD documents. Every alias
# is registered against the same bundled TTF/TTC so the PDF renderer can
# resolve whatever name a producer embedded.
font_map = {"simsun.ttc":["宋体", "SWPMEH+SimSun","SimSun","SWDKON+SimSun"],
            'simkai.ttf':["KaiTi","楷体","SWLCQE+KaiTi","SWHGME+KaiTi","BWSimKai"],
            # 'STKAITI.TTF':["华文楷体 常规","STKAITI","华文楷体"],
            "COURI.TTF":["CourierNewPSMT","CourierNew","SWCRMF+CourierNewPSMT","SWANVV+CourierNewPSMT"],
            "courbd.TTF":["Courier New"],
            "simhei.ttf":["SimHei","hei","黑体"]
            }
pdfmetrics.registerFont(UnicodeCIDFont('STSong-Light'))

# Register the fonts at import time; a missing font file is logged and
# skipped so importing this package never fails.
for font, names in font_map.items():
    for name in names:
        try:
            pdfmetrics.registerFont(TTFont(name, font))
        except Exception:  # was a bare except: don't swallow SystemExit/KeyboardInterrupt
            logger.warning(f"FONT  registerFont failed {font}: {name}")

from format_convert.easyofd.easyofd.parser_ofd.ofd_parser import OFDParser
__all__ = ["OFDParser"]
+                                    
+
+
+

+ 145 - 0
format_convert/easyofd/easyofd/parser_ofd/file_annotation_parser.py

@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_annotation_parser.py
+# CREATE_TIME: 2025/3/28 14:12
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 注释解析
+import re
+
+from loguru import logger
+from .file_parser_base import FileParserBase
+
+
+# class AnnotationsParser(FileParserBase):
+#     """
+#     Parser Annotations
+#     注释信息-总
+#     /xml_dir/Doc_0/Pages/Page_0/Content.xml
+#     """
+#
+#     def __call__(self):
+#         info = {}
+#         annotations_res: list = []
+#         annotations_res_key = "ofd:Page"
+#         self.recursion_ext(self.xml_obj, annotations_res, annotations_res_key)
+#         # logger.debug(f"annotations_res is {annotations_res}")
+#         if annotations_res:
+#             for i in annotations_res:
+#                 page_id = i.get("@PageID")
+#                 if not page_id:
+#                     # logger.debug(f"page_id is null ")
+#                     continue
+#                 file_Loc = i.get("ofd:FileLoc")
+#                 if not file_Loc:
+#                     # logger.debug(f"file_Loc is null ")
+#                     continue
+#                 info[page_id] = {
+#                     "FileLoc": file_Loc,
+#                 }
+#
+#         return info
+#
+#
+# class AnnotationFileParser(FileParserBase):
+#     """
+#     Parser Annotation
+#     注释类 包含 签名注释 水印注释 信息注释
+#     """
+#
+#     AnnoType = {
+#         "Watermark": {
+#             "name": "水印",
+#             "type": "Watermark"
+#         },
+#         "Link": {
+#             "name": "链接",
+#             "type": "Link"
+#         }
+#         ,
+#         "Path": {
+#             "name": "路径",
+#             "type": "Path"
+#         },
+#         "Highlight": {
+#             "name": "高亮",
+#             "type": "Highlight"
+#         },
+#         "Stamp": {
+#             "name": "签章",
+#             "type": "Highlight"
+#         }
+#     }
+#
+#     def normalize_font_name(self, font_name):
+#         """将字体名称规范化,例如 'Times New Roman Bold' -> 'TimesNewRoman-Bold'"""
+#         # 替换空格为无,并将样式(Bold/Italic等)用连字符连接
+#         if not isinstance(font_name, str):
+#             return ""
+#         normalized = font_name.replace(' ', '')
+#         # 处理常见的样式后缀
+#         for style in ['Bold', 'Italic', 'Regular', 'Light', 'Medium', ]:
+#             if style in normalized:
+#                 normalized = normalized.replace(style, f'-{style}')
+#
+#         # todo 特殊字体名规范 后续存在需要完善
+#         if normalized == "TimesNewRoman":
+#             normalized = normalized.replace("TimesNewRoman", "Times-Roman")
+#         return normalized
+#
+#     def __call__(self):
+#         info = {}
+#         public_res: list = []
+#         public_res_key = "ofd:Page"
+#         self.recursion_ext(self.xml_obj, public_res, public_res_key)
+#
+#         if public_res:
+#             for i in public_res:
+#                 info[i.get("@ID")] = {
+#                     "FontName": self.normalize_font_name(i.get("@FontName")),
+#                     "FontNameORI": i.get("@FontName"),
+#                     "FamilyName": self.normalize_font_name(i.get("@FamilyName")),
+#                     "FamilyNameORI": i.get("@FamilyName"),
+#                     "Bold": i.get("@Bold"),
+#                     "Serif": i.get("@Serif"),
+#                     "FixedWidth": i.get("@FixedWidth"),
+#                     "FontFile": i.get("ofd:FontFile"),
+#                 }
+#         return info
+
+
class AnnotationFileParser(FileParserBase):
    """
    Parse Annotations.xml, the per-doc annotation index. It contains:
    1. the annotation file locations per page

    /xml_dir/Doc_0/Annotations.xml
    """

    def loc2page_no(self, loc, idx):
        """Extract a page number from a FileLoc path; fall back to the list index."""
        # ``loc`` may be None when ofd:FileLoc is absent - the old code then
        # crashed inside re.search with a TypeError.
        match = re.search(r"\d+", loc or "")
        return int(match.group()) if match else idx

    def __call__(self):
        annot_info = {}

        # ofd:Page - annotation entries per body page
        page = []
        page_id_map = {}
        self.recursion_ext(self.xml_obj, page, "ofd:Page")
        if page:
            # recursion_ext may also yield plain strings (the list
            # comprehension below already guards for them); the old dict
            # comprehension called .get unconditionally and crashed.
            page_id_map = {
                entry.get("@PageID"): self.loc2page_no(entry.get("ofd:FileLoc"), idx)
                for idx, entry in enumerate(page)
                if isinstance(entry, dict)
            }
            page = [entry.get("ofd:FileLoc") if isinstance(entry, dict) else entry for entry in page]

        annot_info["annot_page"] = page
        annot_info["annot_page_id_map"] = page_id_map
        return annot_info

+ 7 - 0
format_convert/easyofd/easyofd/parser_ofd/file_attachment_parser.py

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_attachment_parser.py
+# CREATE_TIME: 2025/4/9 18:52
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:

+ 140 - 0
format_convert/easyofd/easyofd/parser_ofd/file_content_parser.py

@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_content_parser.py
+# CREATE_TIME: 2025/3/28 11:47
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析正文
+from loguru import  logger
+from .file_parser_base import FileParserBase
+
+
class ContentFileParser(FileParserBase):
    """
    Parse a page's Content.xml (page bodies and templates):
    /xml_dir/Doc_0/Doc_0/Pages/Page_0/Content.xml

    Returns ``{"text_list": [...], "img_list": [...], "line_list": [...]}``.
    """

    def fetch_cell_info(self, row, TextObject):
        """Build one text-cell dict from an ofd:TextObject node and one of its ofd:TextCode entries."""
        cell_d = {}  # (the old code initialized this dict twice)
        cell_d["ID"] = row['@ID']
        # Glyph/CGTransform info (glyph ids for embedded fonts), when present.
        cg_transform = row.get("ofd:CGTransform")
        if cg_transform:
            cell_d["Glyphs_d"] = {
                "Glyphs": cg_transform.get("ofd:Glyphs"),
                "GlyphCount": cg_transform.get("@GlyphCount"),
                "CodeCount": cg_transform.get("@CodeCount"),
                "CodePosition": cg_transform.get("@CodePosition"),
            }

        cell_d["pos"] = [float(pos_i) for pos_i in row['@Boundary'].split(" ")]  # text bounding box
        clip_path = row.get('ofd:Clips', {}).get('ofd:Clip', {}).get('ofd:Area', {}).get('ofd:Path', {})
        if clip_path:
            try:
                cell_d["clips_pos"] = [float(pos_i) for pos_i in clip_path.get('@Boundary', "").split(" ")]
            except Exception:  # malformed/absent @Boundary: keep the cell without clip info
                pass
        cell_d["text"] = str(TextObject.get('#text'))
        cell_d["font"] = row['@Font']  # font resource id
        cell_d["size"] = float(row['@Size'])  # font size

        color = self.ofd_param("ofd:FillColor", row).get("@Value", "0 0 0")
        cell_d["color"] = tuple(color.split(" "))  # fill color, defaults to black

        cell_d["DeltaY"] = TextObject.get("@DeltaY", "")  # per-glyph y offsets (one way vertical text is encoded)
        cell_d["DeltaX"] = TextObject.get("@DeltaX", "")  # per-glyph x offsets
        cell_d["CTM"] = row.get("@CTM", "")  # transform matrix
        cell_d["X"] = TextObject.get("@X", "")  # text offset inside the text box
        cell_d["Y"] = TextObject.get("@Y", "")
        return cell_d

    def __call__(self) -> dict:
        """
        Extract body coordinates plus text/image/path info.

        Despite the old ``-> list`` annotation this has always returned a
        dict of three lists; the annotation is corrected here. Text cells
        look like::

            {"pos": [...], "text": ..., "font": ..., "size": ...}
        """
        text_list = []
        img_list = []
        line_list = []
        content_d = {
            "text_list": text_list,
            "img_list": img_list,
            "line_list": line_list,
        }

        # ofd:TextObject - body text
        text = []
        self.recursion_ext(self.xml_obj, text, "ofd:TextObject")
        for row in text:
            text_code = row.get('ofd:TextCode', {})
            if isinstance(text_code, list):
                for entry in text_code:
                    if not entry.get('#text'):
                        continue
                    text_list.append(self.fetch_cell_info(row, entry))
            elif isinstance(text_code, dict):
                if not text_code.get('#text'):
                    continue
                text_list.append(self.fetch_cell_info(row, text_code))
            else:
                logger.error(f"'ofd:TextCode' format nonsupport  {row.get('ofd:TextCode', {})}")
                continue

        # ofd:PathObject - lines / drawn paths
        line = []
        self.recursion_ext(self.xml_obj, line, "ofd:PathObject")
        for path_obj in line:
            line_d = {}
            try:
                line_d["ID"] = path_obj.get("@ID", "")
                line_d["pos"] = [float(pos_i) for pos_i in path_obj['@Boundary'].split(" ")]  # bounding box
                line_d["LineWidth"] = path_obj.get("@LineWidth", "")
                line_d["AbbreviatedData"] = path_obj.get("ofd:AbbreviatedData", "")  # path drawing commands
                # NOTE(review): FillColor is split into a list while
                # StrokeColor stays a raw string - consumers may rely on this
                # asymmetry, so it is preserved; confirm before unifying.
                line_d["FillColor"] = self.ofd_param("ofd:FillColor", path_obj).get('@Value', "0 0 0").split(" ")
                line_d["StrokeColor"] = self.ofd_param("ofd:StrokeColor", path_obj).get('@Value', "0 0 0")
                line_d["CTM"] = path_obj.get("@CTM", "")  # transform matrix
            except KeyError as e:
                logger.error(f"{e} \n line is {path_obj} \n")
                continue
            line_list.append(line_d)

        # ofd:ImageObject - images
        img = []
        self.recursion_ext(self.xml_obj, img, "ofd:ImageObject")
        for img_obj in img:
            img_d = {
                "CTM": img_obj.get("@CTM", ""),  # transform matrix
                # was ``img_obj.get("ID", "")`` which always yielded "" -
                # xmltodict exposes XML attributes with an '@' prefix
                "ID": img_obj.get("@ID", ""),
                "ResourceID": img_obj.get("@ResourceID", ""),  # resource id into DocumentRes
                "pos": [float(pos_i) for pos_i in img_obj['@Boundary'].split(" ")],  # bounding box
            }
            img_list.append(img_d)

        return content_d
+

+ 7 - 0
format_convert/easyofd/easyofd/parser_ofd/file_customtag_parser.py

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_customtag_parser.py
+# CREATE_TIME: 2025/4/9 18:51
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:

+ 104 - 0
format_convert/easyofd/easyofd/parser_ofd/file_deal.py

@@ -0,0 +1,104 @@
+# coding: utf-8
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+#PROJECT_NAME: D:\code\easyofd\easyofd\parser
+#CREATE_TIME: 2023-07-27 
+#E_MAIL: renoyuan@foxmail.com
+#AUTHOR: reno 
+#NOTE:  文件处理
+import os
+import base64
+import shutil
+from typing import Any
+from uuid import uuid1
+
+import xmltodict
+import zipfile
+from loguru import logger
+
+from .path_parser import PathParser
+
+
class FileRead(object):
    """
    Read an OFD (zip) container from base64, unzip it and build a file tree.

    Resulting tree keys:
    'root': unzip directory
    'root_doc': path of OFD.xml (entry point), '' when absent
    'pdf_name': target pdf file name
    xml paths map to parsed xmltodict objects, other paths map to b64 strings.
    """

    def __init__(self, ofdb64: str):
        self.ofdbyte = base64.b64decode(ofdb64)
        pid = os.getpid()
        # pid + uuid keeps concurrent conversions from colliding on disk
        self.name = f"{pid}_{str(uuid1())}.ofd"
        self.pdf_name = self.name.replace(".ofd", ".pdf")
        self.zip_path = f"{os.getcwd()}/{self.name}"
        self.unzip_path = ""
        self.file_tree = {}

    def unzip_file(self, unzip_dir=None):
        """
        Write the OFD bytes to disk and extract the archive.
        :param unzip_dir: target directory; defaults to a sibling of the zip.
        """
        if unzip_dir is None:
            # strip only the ".ofd" suffix - the old ``split('.')[0]`` broke
            # when the working directory itself contained a dot
            self.unzip_path = os.path.splitext(self.zip_path)[0]
            self.zip_path = f"{os.getcwd()}/{self.name}"
        else:
            self.unzip_path = unzip_dir
            # join instead of raw concatenation so a directory without a
            # trailing separator still yields a valid path
            self.zip_path = os.path.join(unzip_dir, self.name)
        print('ofd self.unzip_path', self.unzip_path)
        print('ofd self.zip_path', self.zip_path)

        with open(self.zip_path, "wb") as f:
            f.write(self.ofdbyte)

        with zipfile.ZipFile(self.zip_path, 'r') as f:
            for file in f.namelist():
                # skip attachments - they are not rendered
                if 'Attachs' in file:
                    continue
                f.extract(file, path=self.unzip_path)
        # save_xml/xml_name are normally set by __call__; default to off so a
        # direct unzip_file() call no longer raises AttributeError
        if getattr(self, "save_xml", False):
            print("saving xml {}".format(self.xml_name))
            with zipfile.ZipFile(self.zip_path, 'r') as f:
                for file in f.namelist():
                    f.extract(file, path=self.xml_name)

    def buld_file_tree(self):
        """Walk the unzip dir: xml files -> xmltodict objects, others -> b64 strings."""
        self.file_tree["root"] = self.unzip_path
        self.file_tree["pdf_name"] = self.pdf_name
        for root, dirs, files in os.walk(self.unzip_path):
            for file in files:
                abs_path = os.path.join(root, file)
                # with-blocks close the handles the old one-liners leaked
                if "xml" not in file:
                    with open(abs_path, "rb") as fh:
                        self.file_tree[abs_path] = str(base64.b64encode(fh.read()), "utf-8")
                else:
                    with open(abs_path, "r", encoding="utf-8") as fh:
                        self.file_tree[abs_path] = xmltodict.parse(fh.read())
        self.file_tree["root_doc"] = os.path.join(self.unzip_path, "OFD.xml") if os.path.join(self.unzip_path, "OFD.xml") in self.file_tree else ""

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        """Unzip into ``save_dir`` (kwarg) and return the file tree."""
        self.save_xml = kwds.get("save_xml", False)
        self.xml_name = kwds.get("xml_name")
        self.save_dir = kwds.get('save_dir')

        self.unzip_file(self.save_dir)
        self.buld_file_tree()
        return self.file_tree
+
+
+if __name__ == "__main__":
+    with open(r"D:/code/easyofd/test/增值税电子专票5.ofd","rb") as f:
+        ofdb64 = str(base64.b64encode(f.read()),"utf-8")
+    a = FileRead(ofdb64)()
+    print(list(a.keys()))

+ 99 - 0
format_convert/easyofd/easyofd/parser_ofd/file_doc_parser.py

@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_doc_parser.py
+# CREATE_TIME: 2025/3/28 11:46
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析document
+
+import  re
+
+from .file_parser_base import FileParserBase
+
+
+
class DocumentFileParser(FileParserBase):
    """
    Parse Document.xml, the per-doc root node. It contains:
    1. file locations (resources, templates, pages, annotations, ...)
    2. the doc's physical size

    /xml_dir/Doc_0/Document.xml
    """

    def loc2page_no(self, loc, idx):
        """Extract a page number from a BaseLoc path; fall back to the list index."""
        # ``loc`` may be None when @BaseLoc is absent - the old code then
        # crashed inside re.search with a TypeError.
        match = re.search(r"\d+", loc or "")
        return int(match.group()) if match else idx

    def __call__(self):
        document_info = {}

        # physical page size
        physical_box = []
        self.recursion_ext(self.xml_obj, physical_box, "ofd:PhysicalBox")
        document_info["size"] = physical_box[0] if physical_box else ""

        # ofd:PublicRes path(s) - font information
        public_res = []
        self.recursion_ext(self.xml_obj, public_res, "ofd:PublicRes")
        document_info["public_res"] = public_res

        # ofd:DocumentRes path(s) - static image resources
        document_res = []
        self.recursion_ext(self.xml_obj, document_res, "ofd:DocumentRes")
        document_info["document_res"] = document_res

        # template pages
        tpls = []
        self.recursion_ext(self.xml_obj, tpls, "ofd:TemplatePage")
        if tpls:
            tpls = [t.get("@BaseLoc") if isinstance(t, dict) else t for t in tpls]
        document_info["tpls"] = tpls

        # ofd:Page - body pages
        page = []
        page_id_map = {}
        self.recursion_ext(self.xml_obj, page, "ofd:Page")
        if page:
            # recursion_ext may also yield plain strings (the list
            # comprehension below already guards for them); the old dict
            # comprehension called .get unconditionally and crashed.
            page_id_map = {
                entry.get("@ID"): self.loc2page_no(entry.get("@BaseLoc"), idx)
                for idx, entry in enumerate(page)
                if isinstance(entry, dict)
            }
            page = [entry.get("@BaseLoc") if isinstance(entry, dict) else entry for entry in page]
        document_info["page"] = page
        document_info["page_id_map"] = page_id_map

        # ofd:Annotations entry file(s)
        annotations = []
        self.recursion_ext(self.xml_obj, annotations, "ofd:Annotations")
        document_info["Annotations"] = annotations

        # ofd:Attachments entry file(s)
        attachments = []
        self.recursion_ext(self.xml_obj, attachments, "ofd:Attachments")
        document_info["attachments"] = attachments

        # ofd:CustomTags entry file(s)
        custom_tag = []
        self.recursion_ext(self.xml_obj, custom_tag, "ofd:CustomTags")
        document_info["custom_tag"] = custom_tag

        return document_info
+
+
+
+
+
+

+ 36 - 0
format_convert/easyofd/easyofd/parser_ofd/file_docres_parser.py

@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_docres_parser.py
+# CREATE_TIME: 2025/3/28 11:48
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析 DocumentRes
+
+import os
+
+from .file_parser_base import FileParserBase
+
class DocumentResFileParser(FileParserBase):
    """
    Parse DocumentRes/PublicRes: extract the multimedia (image) resources.
    /xml_dir/Doc_0/DocumentRes.xml
    /xml_dir/Doc_0/PublicRes.xml
    """

    def __call__(self):
        media_nodes = []
        self.recursion_ext(self.xml_obj, media_nodes, "ofd:MultiMedia")
        info = {}
        for node in media_nodes:
            file_name = node.get("ofd:MediaFile", "")
            info[node.get("@ID")] = {
                "format": node.get("@Format", ""),
                "wrap_pos": node.get("@wrap_pos", ""),
                "type": node.get("@Type", ""),
                # file extension without the leading dot
                "suffix": os.path.splitext(file_name)[-1].replace(".", ""),
                "fileName": file_name,
            }
        return info

+ 41 - 0
format_convert/easyofd/easyofd/parser_ofd/file_ofd_parser.py

@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_ofd_parser.py
+# CREATE_TIME: 2025/3/28 11:45
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析OFD
+from .file_parser_base import FileParserBase
+
class OFDFileParser(FileParserBase):
    """
    Parse the container entry point OFD.xml.
    /xml_dir/OFD.xml
    """

    def __call__(self):
        # (output key, xml tag) pairs - all extracted the same generic way
        wanted = (
            ("doc_root", "ofd:DocRoot"),
            ("signatures", "ofd:Signatures"),
            ("creator", "ofd:Creator"),
            ("creationDate", "ofd:CreationDate"),
        )
        info = {}
        for out_key, tag in wanted:
            values = []
            self.recursion_ext(self.xml_obj, values, tag)
            info[out_key] = values
        return info

+ 58 - 0
format_convert/easyofd/easyofd/parser_ofd/file_parser.py

@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: D:\code\easyofd\easyofd\parser
+# CREATE_TIME: 2023-07-27
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 每种类型的文件定义一个解析器
+
+import sys
+
+sys.path.insert(0, "..")
+import logging
+import os
+import traceback
+import base64
+import re
+from typing import Any
+from .parameter_parser import ParameterParser
+logger = logging.getLogger("root")
+
+
class FileParserBase(object):
    """XML parser base: holds the xmltodict tree and a generic tag extractor.

    NOTE(review): this module duplicates file_parser_base.py - confirm which
    of the two is canonical before extending either.
    """

    def __init__(self, xml_obj):
        # reject empty/None trees early; the parsers are useless without one
        assert xml_obj
        self.ofd_param = ParameterParser()
        self.xml_obj = xml_obj
        # print(xml_obj)

    def recursion_ext(self, need_ext_obj, ext_list, key):
        """
        Recursively collect every value stored under ``key``.
        need_ext_obj : xmltodict tree (dict)
        ext_list: output container, mutated in place
        key: tag name to collect
        """
        if isinstance(need_ext_obj, dict):
            for k, v in need_ext_obj.items():
                if k == key:
                    # dict/str hits are appended; list hits are flattened in;
                    # any other type is silently dropped
                    if isinstance(v, (dict, str)):
                        ext_list.append(v)
                    elif isinstance(v, list):
                        ext_list.extend(v)
                else:
                    if isinstance(v, dict):
                        self.recursion_ext(v, ext_list, key)
                    elif isinstance(v, list):
                        for cell in v:
                            self.recursion_ext(cell, ext_list, key)
                    else:
                        pass
        else:
            print(type(need_ext_obj))
+
+
+if __name__ == "__main__":
+    FileParserBase("")()

+ 63 - 0
format_convert/easyofd/easyofd/parser_ofd/file_parser_base.py

@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_parser_base.py
+# CREATE_TIME: 2025/3/28 11:43
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: base 解析器
+
+import sys
+
+sys.path.insert(0, "..")
+import logging
+import os
+import traceback
+import base64
+import re
+from typing import Any
+from .parameter_parser import ParameterParser
+logger = logging.getLogger("root")
+
+
class FileParserBase(object):
    """XML parser base: holds the xmltodict tree and a generic tag extractor."""

    def __init__(self, xml_obj):
        # an empty/None tree is useless for every subclass - fail fast
        assert xml_obj
        self.ofd_param = ParameterParser()
        self.xml_obj = xml_obj

    def recursion_ext(self, need_ext_obj, ext_list, key):
        """
        Recursively collect every value stored under ``key``.
        need_ext_obj : xmltodict tree (dict)
        ext_list: output container, mutated in place
        key: tag name to collect
        """
        if not isinstance(need_ext_obj, dict):
            print(type(need_ext_obj))
            return
        for child_key, child in need_ext_obj.items():
            if child_key == key:
                # list hits are flattened in; dict/str hits are appended;
                # any other type is silently dropped
                if isinstance(child, list):
                    ext_list.extend(child)
                elif isinstance(child, (dict, str)):
                    ext_list.append(child)
            elif isinstance(child, dict):
                self.recursion_ext(child, ext_list, key)
            elif isinstance(child, list):
                for element in child:
                    self.recursion_ext(element, ext_list, key)
+

+ 52 - 0
format_convert/easyofd/easyofd/parser_ofd/file_publicres_parser.py

@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_publicres_parser.py
+# CREATE_TIME: 2025/3/28 11:49
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: PublicResFileParser
+
+from .file_parser_base import FileParserBase
+
+
class PublicResFileParser(FileParserBase):
    """
    Parse PublicRes.xml: collect the shared font table.
    /xml_dir/Doc_0/PublicRes.xml
    """

    def normalize_font_name(self, font_name):
        """Normalize a font name, e.g. 'Times New Roman Bold' -> 'TimesNewRoman-Bold'."""
        if not isinstance(font_name, str):
            return ""
        # drop spaces, then hyphenate the common style suffixes
        normalized = font_name.replace(' ', '')
        for style in ('Bold', 'Italic', 'Regular', 'Light', 'Medium'):
            if style in normalized:
                normalized = normalized.replace(style, f'-{style}')
        # TODO: special-case table for odd font names; extend as needed
        if normalized == "TimesNewRoman":
            normalized = "Times-Roman"
        return normalized

    def __call__(self):
        fonts = []
        self.recursion_ext(self.xml_obj, fonts, "ofd:Font")
        info = {}
        for font in fonts:
            info[font.get("@ID")] = {
                "FontName": self.normalize_font_name(font.get("@FontName")),
                "FontNameORI": font.get("@FontName"),
                "FamilyName": self.normalize_font_name(font.get("@FamilyName")),
                "FamilyNameORI": font.get("@FamilyName"),
                "Bold": font.get("@Bold"),
                "Serif": font.get("@Serif"),
                "FixedWidth": font.get("@FixedWidth"),
                "FontFile": font.get("ofd:FontFile"),
            }
        return info

+ 63 - 0
format_convert/easyofd/easyofd/parser_ofd/file_signature_parser.py

@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_signature_parser.py
+# CREATE_TIME: 2025/3/28 14:13
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 签章解析
+
+from .file_parser_base import FileParserBase
+
class SignaturesFileParser(FileParserBase):
    """
    Parse Signatures.xml — the document-level signature index.

    Typical location: /xml_dir/Doc_0/Signs/Signatures.xml
    Returns a dict keyed by signature @ID with its BaseLoc/Type/ID.
    """

    def __call__(self):
        nodes: list = []
        self.recursion_ext(self.xml_obj, nodes, "ofd:Signature")
        return {
            node.get("@ID"): {
                "BaseLoc": node.get("@BaseLoc"),
                "Type": node.get("@Type"),
                "ID": node.get("@ID"),
            }
            for node in nodes
        }
+
+
class SignatureFileParser(FileParserBase):
    """
    Parse a single Signature.xml — stamp-annotation placement for one seal.
    """

    def __call__(self, prefix=""):
        stamp_nodes: list = []
        self.recursion_ext(self.xml_obj, stamp_nodes, "ofd:StampAnnot")

        signed_values: list = []
        self.recursion_ext(self.xml_obj, signed_values, "ofd:SignedValue")

        # Path to the binary signed value; falls back to the conventional name.
        if signed_values:
            signed_value_path = f"{prefix}/{signed_values[0]}"
        else:
            signed_value_path = f"{prefix}/SignedValue.dat"

        info = {}
        # NOTE(review): when several StampAnnot nodes exist only the last one
        # survives — this matches the original behaviour; confirm it's intended.
        for node in stamp_nodes:
            info = {
                "PageRef": node.get("@PageRef"),  # id of the page the stamp is on
                "Boundary": node.get("@Boundary"),
                "ID": node.get("@ID"),
                "SignedValue": signed_value_path,
            }

        return info

+ 100 - 0
format_convert/easyofd/easyofd/parser_ofd/find_seal_img.py

@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd read_seal_img
+# CREATE_TIME: 2024/5/28 14:13
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note: 根据 ASN.1 解析签章 拿到 签章图片
+import io
+
+from PIL import Image, UnidentifiedImageError
+from loguru import logger
+from pyasn1.codec.der.decoder import decode
+from pyasn1.type import univ
+from pyasn1.error import PyAsn1Error
+
+
+
class SealExtract(object):
    """
    Extract embedded seal/stamp images from an OFD SignedValue.dat by DER
    decoding its ASN.1 structure and collecting every OctetString whose
    payload is a recognisable image.
    """

    def __init__(self):
        pass

    def read_signed_value(self, path):
        """
        Read *path* and DER-decode it.

        :return: the decoded ASN.1 root object, or None when the bytes are
            not valid DER.

        Fix: the original returned from a ``finally`` block, which silently
        swallowed every in-flight exception (not just PyAsn1Error) and could
        itself raise NameError on an unbound local.
        """
        with open(path, 'rb') as file:
            binary_data = file.read()
        try:
            decoded_data, _ = decode(binary_data)
        except PyAsn1Error:
            # Not valid DER — callers treat None as "nothing to extract".
            decoded_data = None
        return decoded_data

    def find_octet_strings(self, asn1_data, octet_strings: list):
        """Recursively collect every univ.OctetString under *asn1_data* into *octet_strings*."""
        if isinstance(asn1_data, univ.OctetString):
            octet_strings.append(asn1_data)
        elif isinstance(asn1_data, (univ.Sequence, univ.Set)):
            # Iterating a Sequence/Set yields component names; index by name.
            for component in asn1_data:
                self.find_octet_strings(asn1_data[f"{component}"], octet_strings)
        elif isinstance(asn1_data, univ.Choice):
            self.find_octet_strings(asn1_data.getComponent(), octet_strings)
        elif isinstance(asn1_data, univ.Any):
            # An ANY wrapper may itself contain DER — try to decode and recurse.
            try:
                sub_data, _ = decode(asn1_data.asOctets())
                self.find_octet_strings(sub_data, octet_strings)
            except PyAsn1Error:
                pass

    def hex_to_image(self, hex_data, image_format='PNG', inx=0):
        """
        Decode a hex string into a PIL Image.

        :param hex_data: image payload as a hex string (without '0x' prefix)
        :param image_format: kept for interface compatibility (unused)
        :param inx: kept for interface compatibility (debug index)
        :return: a PIL Image, or None when the bytes are not an image
        """
        binary_data = bytes.fromhex(hex_data)
        image_stream = io.BytesIO(binary_data)
        try:
            return Image.open(image_stream)
        except UnidentifiedImageError:
            # Many OctetStrings hold certificates/signatures, not images.
            return None

    def __call__(self, path):
        """Return the list of PIL Images embedded in the file at *path*."""
        decoded_data = self.read_signed_value(path)
        octet_strings = []
        img_list = []  # usually a single seal image; multiples kept just in case
        if decoded_data:
            self.find_octet_strings(decoded_data, octet_strings)
            for i, octet_string in enumerate(octet_strings):
                pretty = str(octet_string.prettyPrint())
                # Binary OctetStrings pretty-print as '0x...' hex dumps.
                if pretty.startswith("0x"):
                    img = self.hex_to_image(pretty[2:], inx=i)
                    if img:
                        img_list.append(img)
        return img_list
+
if __name__=="__main__":
    # Ad-hoc manual check: print the seal images found in a local sample file.
    print(SealExtract()(r"F:\code\easyofd\test\1111_xml\Doc_0\Signs\Sign_0\SignedValue.dat" ))
+

+ 35 - 0
format_convert/easyofd/easyofd/parser_ofd/img_deal.py

@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd img_deal
+# CREATE_TIME: 2024/7/18 11:20
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note: img 操作
+from io import BytesIO
class DealImg(object):
    """Small helpers for converting PIL images to bytes / byte streams."""

    def __init__(self):
        pass

    def resize(self):
        """Placeholder for a future resize helper (currently a no-op)."""
        pass

    def pil2bytes(self, image):
        """Encode *image* as PNG and return the raw bytes."""
        buffer = BytesIO()
        image.save(buffer, format='PNG')
        try:
            return buffer.getvalue()
        finally:
            buffer.close()

    def pil2bytes_io(self, image):
        """Encode *image* as PNG and return the (still open) BytesIO buffer."""
        buffer = BytesIO()
        image.save(buffer, format='PNG')
        return buffer
+
+
+

+ 607 - 0
format_convert/easyofd/easyofd/parser_ofd/ofd_parser.py

@@ -0,0 +1,607 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: D:\code\easyofd\easyofd\parser
+# CREATE_TIME: 2023-07-27
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: ofd解析主流程
+
+import os
+import sys
+sys.path.append(os.path.dirname(__file__) + "/../../../../")
+from format_convert.easyofd.easyofd.parser_ofd.file_ofd_parser import OFDFileParser
+from jbig2_parser import jbig2_parser
+import traceback
+import base64
+import re
+import io
+# import jbigkit
+from typing import Any, List
+from PIL import Image
+from PIL.Image import Image as ImageClass
+from loguru import logger
+
+from format_convert.easyofd.easyofd.parser_ofd.img_deal import DealImg
+from format_convert.easyofd.easyofd.parser_ofd.file_deal import FileRead
+from format_convert.easyofd.easyofd.parser_ofd.file_ofd_parser import OFDFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_doc_parser import DocumentFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_docres_parser import DocumentResFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_content_parser import ContentFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_annotation_parser import AnnotationFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_publicres_parser import PublicResFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_signature_parser import SignaturesFileParser,SignatureFileParser
+from format_convert.easyofd.easyofd.parser_ofd.path_parser import PathParser
+# todo 解析流程需要大改
+
+
class OFDParser(object):
    """
    OFD parsing pipeline:
    1. unpack the archive, build the file-mapping tree, extract files
    2. walk the XMLs level by level collecting structured text and resources
    3. register embedded fonts

    Layer order: tpl > content > annotation
    """

    def __init__(self, ofdb64):
        """
        :param ofdb64: the whole OFD file as a base64 string
        """
        self.img_deal = DealImg()
        self.ofdb64 = ofdb64
        self.file_tree = None
        # Path to the external jbig2dec binary used by jb22png_old().
        # Fix: the old hard-coded, machine-specific default can now be
        # overridden via the JBIG2DEC_PATH environment variable.
        self.jbig2dec_path = os.environ.get(
            "JBIG2DEC_PATH",
            r'D:\Anaconda3\pkgs\jbig2dec-0.18-ha9979f8_0\Library\bin\jbig2dec.exe')
+
+    def img2data(self, imglist: List[ImageClass]):
+        """
+        imglist to ofd data
+        
+        """
+        OP = 200 / 25.4
+        doc_list = []
+        img_info = {}
+        page_size = []
+        font_info = {}
+        page_info_d = {}
+
+        for idx, img_pil in enumerate(imglist):
+            w, h = img_pil.size
+            img_bytes = self.img_deal.pil2bytes(img_pil)
+            imgb64 = str(base64.b64encode(img_bytes), encoding="utf-8")
+            img_info[str(idx)] = {
+                "format": "jpg",
+                "wrap_pos": "",
+                "type": "IMG",
+                "suffix": "jpg",
+                "fileName": f"{idx}.jpg",
+                "imgb64": imgb64,
+
+            }
+            text_list = []
+            img_list = []
+            img_d = {}
+            img_d["CTM"] = ""  # 平移矩阵换 平移 缩放 旋转
+            img_d["ID"] = str(idx)  # 图片id
+            img_d["ResourceID"] = str(idx)  # 图片id
+            img_d["pos"] = [0, 0, w / OP, h / OP]  # 平移矩阵换
+            page_size = [0, 0, w / OP, h / OP]
+            # print(page_size)
+            img_list.append(img_d)
+
+            content_d = {
+                "text_list": text_list,
+                "img_list": img_list,
+            }
+            page_info_d[idx] = content_d
+        doc_list.append({
+            "pdf_name": "demo.pdf",
+            "doc_no": "0",
+            "images": img_info,
+            "page_size": page_size,
+            "fonts": font_info,
+            "page_info": page_info_d
+        })
+
+        return doc_list
+
+    # 获得xml 对象
+    def get_xml_obj(self, label):
+        assert label
+        # print(self.file_tree.keys())
+        label =label.lstrip('./')
+        for abs_p in self.file_tree:
+            # 统一符号,避免win linux 路径冲突
+
+            abs_p_compare = abs_p.replace("\\\\", "-").replace("//", "-").replace("\\", "-").replace("/", "-")
+            label_compare = label.replace("\\\\", "-").replace("//", "-").replace("\\", "-").replace("/", "-")
+            if label_compare in abs_p_compare:
+                # logger.info(f"{label} {abs_p}")
+                return self.file_tree[abs_p]
+        # logger.info(f"{label} ofd file path is not")
+        return ""
+
    def jb22png_old(self, img_d: dict):
        """
        Convert a JBIG2 image entry to PNG in place by shelling out to an
        external jbig2dec binary; a no-op (with a warning) when the binary
        configured in self.jbig2dec_path is absent.

        Mutates *img_d* on success: fileName/suffix/format/imgb64.
        """
        if not os.path.exists(self.jbig2dec_path):
            logger.warning(f"未安装jbig2dec,无法处理jb2文件")
            return

        # todo ib2 转png C:/msys64/mingw64/bin/jbig2dec.exe -o F:\code\easyofd\test\image_80.png F:\code\easyofd\test\image_80.jb2
        fileName = img_d["fileName"]
        print('jb2 file_name', fileName)
        new_fileName = img_d['fileName'].replace(".jb2", ".png")
        # Dump the base64 payload back to disk so jbig2dec can read it.
        with open(fileName, "wb") as f:
            f.write(base64.b64decode(img_d["imgb64"]))
        # NOTE(review): unquoted paths — a fileName containing spaces breaks
        # this command line; confirm inputs are always space-free.
        command = "{} -o {} {}"
        res = os.system(command.format(self.jbig2dec_path, new_fileName, fileName))
        if res != 0:
            # Failure is deliberately best-effort: fall through and check
            # whether the output file appeared anyway.
            pass
            # logger.warning(f"jbig2dec处理失败")
        # if os.path.exists(fileName):
        #     os.remove(fileName)
        if os.path.exists(new_fileName):
            # logger.info(f"jbig2dec处理成功{fileName}>>{new_fileName}")
            img_d["fileName"] = new_fileName
            img_d["suffix"] = "png"
            img_d["format"] = "png"
            with open(new_fileName, "rb") as f:
                data = f.read()
                img_d["imgb64"] = str(base64.b64encode(data), encoding="utf-8")

            # os.remove(new_fileName)
+
+    def jb22png(self, img_d: dict):
+        """
+        jb22png
+        没有安装 jbig2dec 无法操作
+        """
+
+        file_name = img_d["fileName"]
+        # print('jb2 file_name', file_name)
+        new_file_name = img_d['fileName'].replace(".jb2", ".png")
+        with open(file_name, "rb") as f:
+            data = f.read()
+        png_data = jbig2_parser.parse_jbig2(data)
+        png_bytes = bytes(png_data)
+        # print('png_data', png_data)
+
+        # # 将字节缓冲区转换为图像对象
+        # image = Image.open(io.BytesIO(png_data))
+        #
+        # # 保存图像为 PNG 文件
+        # image.save(new_file_name, 'PNG')
+
+        with open(new_file_name, 'wb') as f:
+            f.write(png_bytes)
+
+        if os.path.exists(new_file_name):
+            # logger.info(f"jbig2dec处理成功{fileName}>>{new_fileName}")
+            img_d["fileName"] = new_file_name
+            img_d["suffix"] = "png"
+            img_d["format"] = "png"
+            with open(new_file_name, "rb") as f:
+                data = f.read()
+                img_d["imgb64"] = str(base64.b64encode(data), encoding="utf-8")
+
+        # decoder = jbigkit.JbgDecoder()
+        # with open(file_name, "rb") as f:
+        #     data = f.read()
+        # status, processed_len = decoder.decode_in(data)
+        # if status != jbigkit.JbgErrno.EOK or processed_len != len(data):
+        #     print('jb2 file error!')
+        #     return
+        # assert status == jbigkit.JbgErrno.EOK
+        # assert processed_len == len(data)
+        #
+        # w, h = decoder.get_width(), decoder.get_height()
+        #
+        # ith_plane = decoder.get_plane(0)  # 获取第一个平面
+        # img = Image.frombytes('1', (w, h), bytes(ith_plane), 'raw', '1;I')
+        # img.save(new_file_name)
+
+        # os.remove(new_fileName)
+
+    def bmp2jpg(self, img_d: dict):
+
+        fileName = img_d["fileName"]
+        new_fileName = img_d['fileName'].replace(".bmp", ".jpg")
+        b64_nmp = self.get_xml_obj(fileName)
+        image_data = base64.b64decode(b64_nmp)
+        image = Image.open(io.BytesIO(image_data))
+        rgb_image = image.convert("RGB")
+        output_buffer = io.BytesIO()
+        rgb_image.save(output_buffer, format="JPEG")
+        image.close()
+        jpeg_bytes = output_buffer.getvalue()
+        b64_jpeg = base64.b64encode(jpeg_bytes).decode('utf-8')
+        output_buffer.close()
+
+        if b64_jpeg:
+            logger.info(f"bmp2jpg处理成功{fileName}>>{new_fileName}")
+            img_d["fileName"] = new_fileName
+            img_d["suffix"] = "jpg"
+            img_d["format"] = "jpg"
+            img_d["imgb64"] = b64_jpeg
+
+    def tif2jpg(self, img_d: dict):
+        fileName = img_d["fileName"]
+        new_fileName = img_d['fileName'].replace(".tif", ".jpg")
+        tif_nmp = self.get_xml_obj(fileName)
+        image_data = base64.b64decode(tif_nmp)
+        image = Image.open(io.BytesIO(image_data))
+        if image.mode in ("RGBA", "LA") or (image.mode == "P" and "transparency" in image.info):
+            image = image.convert("RGB")
+
+            # 创建一个字节流来保存处理后的图像
+        output_buffer = io.BytesIO()
+
+        # 保存图像为 JPEG 格式到字节流中
+        image.save(output_buffer, format="JPEG", quality=95)
+
+        # 获取字节流中的内容并编码为 Base64 字符串
+        jpeg_bytes = output_buffer.getvalue()
+        b64_jpeg = base64.b64encode(jpeg_bytes).decode('utf-8')
+
+        # 关闭图像对象和字节流
+        image.close()
+        output_buffer.close()
+
+        if b64_jpeg:
+            logger.info(f"tif2jpg处理成功{fileName}>>{new_fileName}")
+            img_d["fileName"] = new_fileName
+            img_d["suffix"] = "jpg"
+            img_d["format"] = "jpg"
+            img_d["imgb64"] = b64_jpeg
+
+    def gif2jpg(self, img_d: dict):
+        fileName = img_d["fileName"]
+        new_fileName = img_d['fileName'].replace(".bmp", ".jpg")
+        b64_gif = self.get_xml_obj(fileName)
+        image_data = base64.b64decode(b64_gif)
+        image = Image.open(io.BytesIO(image_data))
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        output_buffer = io.BytesIO()
+        image.save(output_buffer, format="JPEG", quality=95)
+        image.close()
+        jpeg_bytes = output_buffer.getvalue()
+        b64_jpeg = base64.b64encode(jpeg_bytes).decode('utf-8')
+        output_buffer.close()
+
+        if b64_jpeg:
+            logger.info(f"gif2jpg处理成功{fileName}>>{new_fileName}")
+            img_d["fileName"] = new_fileName
+            img_d["suffix"] = "jpg"
+            img_d["format"] = "jpg"
+            img_d["imgb64"] = b64_jpeg
+
    def parser(self, save_dir):
        """
        Main parse pipeline (Doc_0 is assumed to be the only document level):
        OFD > Document.xml > [DocumentRes.xml, PublicRes.xml, Signatures.xml,
        Annotations.xml] > [...]

        :param save_dir: directory the archive was unpacked into; used to
            build on-disk resource paths (assumed to end with a path
            separator — TODO confirm callers always pass it that way).
        :return: one-element list holding the parsed document payload.
        """

        page_size_details = []   # per-page [x0, y0, w, h] boxes
        default_page_size = []   # document-level fallback box
        doc_list = []
        ofd_xml_obj = self.get_xml_obj(self.file_tree["root_doc"])  # parsed OFD.xml object

        if ofd_xml_obj:
            ofd_obj_res = OFDFileParser(ofd_xml_obj)()
            doc_root_name = ofd_obj_res.get("doc_root")
            signatures = ofd_obj_res.get("signatures")
        else:
            # Root OFD.xml missing — fall back to the conventional layout.
            doc_root_name = ["Doc_0/Document.xml"]
            signatures = ["Doc_0/Signs/Signatures.xml"]

        doc_root_xml_obj = self.get_xml_obj(doc_root_name[0])
        doc_root_info = DocumentFileParser(doc_root_xml_obj)()
        doc_page_size = self.get_page_size(doc_root_xml_obj)
        # print('doc_page_size', doc_page_size)

        # Annotation metadata: merged into doc_root_info when present.
        annotations_root_name = doc_root_info.get("Annotations")
        if annotations_root_name:
            annotations_root_name = annotations_root_name[0]
            annot_root_xml_obj = self.get_xml_obj(annotations_root_name)
            # print('annot_root_xml_obj', annot_root_xml_obj)
            annot_root_info = AnnotationFileParser(annot_root_xml_obj)()
            # print('annot_root_info', annot_root_info)
            doc_root_info.update(annot_root_info)
        doc_size = doc_root_info.get("size")

        if doc_size:
            try:
                # Keep only tokens starting with a digit or a dot.
                default_page_size = [float(pos_i) for pos_i in doc_size.split(" ") if re.match("[\d\.]", pos_i)]
            except:
                traceback.print_exc()

        # Font information; embedded font files are attached as base64.
        font_info = {}
        public_res_name: list = doc_root_info.get("public_res")
        if public_res_name:
            public_xml_obj = self.get_xml_obj(public_res_name[0])
            font_info = PublicResFileParser(public_xml_obj)()

            # Register fonts: attach the embedded font payload per font id.
            for font_id, font_v in font_info.items():
                file_name = font_v.get("FontFile")
                if file_name:
                    font_b64 = self.get_xml_obj(file_name)
                    if font_b64:
                        font_v["font_b64"] = font_b64

        # Image resources referenced from DocumentRes.xml.
        img_info: dict = dict()
        document_res_name: list = doc_root_info.get("document_res")
        # print('doc_root_info', doc_root_info)
        if document_res_name:
            document_res_xml_obj = self.get_xml_obj(document_res_name[0])
            # print('document_res_xml_obj', document_res_xml_obj)
            img_info = DocumentResFileParser(document_res_xml_obj)()
            # Resolve each image's base64 payload and convert exotic formats.
            for img_id, img_v in img_info.items():
                img_v["imgb64"] = self.get_xml_obj(img_v.get("fileName"))
                # NOTE(review): hard-coded Windows separators and assumes
                # save_dir ends with a separator — confirm on posix.
                img_v['fileName'] = f"{save_dir}Doc_0\Res\{img_v['fileName']}"
                # todo ib2 转png C:/msys64/mingw64/bin/jbig2dec.exe -o F:\code\easyofd\test\image_80.png F:\code\easyofd\test\image_80.jb2
                if img_v["suffix"] == 'jb2':
                    self.jb22png(img_v)
                elif img_v["suffix"] == 'bmp':
                    self.bmp2jpg(img_v)
                elif img_v["suffix"] == 'tif':
                    self.tif2jpg(img_v)
                elif img_v["suffix"] == 'gif':
                    self.gif2jpg(img_v)

        # Image resources may also live in PublicRes.xml — same treatment.
        img_info2: dict = dict()
        public_res_name: list = doc_root_info.get("public_res")
        # print('doc_root_info', doc_root_info)
        if public_res_name:
            public_res_xml_obj = self.get_xml_obj(public_res_name[0])
            # print('public_res_xml_obj', public_res_xml_obj)
            img_info2 = DocumentResFileParser(public_res_xml_obj)()
            # Resolve each image's base64 payload and convert exotic formats.
            for img_id, img_v in img_info2.items():
                img_v["imgb64"] = self.get_xml_obj(img_v.get("fileName"))
                # print('img_id, img_v[filename]', img_id, img_v.get('fileName'))
                img_v['fileName'] = f"{save_dir}Doc_0\Res\{img_v['fileName']}"

                # todo ib2 转png C:/msys64/mingw64/bin/jbig2dec.exe -o F:\code\easyofd\test\image_80.png F:\code\easyofd\test\image_80.jb2
                if img_v["suffix"] == 'jb2':
                    self.jb22png(img_v)
                elif img_v["suffix"] == 'bmp':
                    self.bmp2jpg(img_v)
                elif img_v["suffix"] == 'tif':
                    self.tif2jpg(img_v)
                elif img_v["suffix"] == 'gif':
                    self.gif2jpg(img_v)
            img_info.update(img_info2)

        page_id_map: list = doc_root_info.get("page_id_map")
        # print('doc_root_info', doc_root_info)

        signatures_page_id = {}
        # Signature (seal) info — parsing currently disabled, kept for reference.
        signatures_xml_obj = None
        # if signatures:
        #     signatures_xml_obj = self.get_xml_obj(signatures[0])
        # if signatures and signatures_xml_obj:
        # # if signatures and (signatures_xml_obj := self.get_xml_obj(signatures[0])):
        # #     logger.debug(f"signatures_xml_obj is {signatures_xml_obj } signatures is {signatures} ")
        #     signatures_info = SignaturesFileParser(signatures_xml_obj)()
        #     if signatures_info:  # 获取签章具体信息
        #         for _, signatures_cell in signatures_info.items():
        #             # print(signatures_info)
        #             BaseLoc = signatures_cell.get("BaseLoc")
        #             signature_xml_obj = self.get_xml_obj(BaseLoc)
        #             # print(BaseLoc)
        #             prefix = BaseLoc.split("/")[0]
        #             signatures_info = SignatureFileParser(signature_xml_obj)(prefix=prefix)
        #             # print(signatures_info)
        #             # logger.debug(f"signatures_info {signatures_info}")
        #             PageRef = signatures_info.get("PageRef")
        #             Boundary = signatures_info.get("Boundary")
        #             SignedValue = signatures_info.get("SignedValue")
        #             sing_page_no = page_id_map.get(PageRef)
        #             # print("self.file_tree",self.file_tree.keys)
        #             # print(page_id_map,PageRef)
        #             # print(SignedValue, self.get_xml_obj(SignedValue))
        #             # with open("b64.txt","w") as f:
        #             #     f.write(self.get_xml_obj(SignedValue))
        #             if signatures_page_id.get(sing_page_no):
        #                 signatures_page_id[sing_page_no].append(
        #                     {
        #                         "sing_page_no": sing_page_no,
        #                         "PageRef": PageRef,
        #                         "Boundary": Boundary,
        #                         "SignedValue": self.get_xml_obj(SignedValue),
        #                     }
        #                 )
        #             else:
        #                 signatures_page_id[sing_page_no] = [
        #                     {
        #                         "sing_page_no": sing_page_no,
        #                         "PageRef": PageRef,
        #                         "Boundary": Boundary,
        #                         "SignedValue": self.get_xml_obj(SignedValue),
        #                     }
        #                 ]

        # Annotation info — parsing currently disabled, kept for reference.
        # print('doc_root_info', doc_root_info)
        # annotation_name: list = doc_root_info.get("Annotations")
        # annotation_xml_obj = None
        # if annotation_name:
        #     annotation_xml_obj = self.get_xml_obj(annotation_name[0])
        # if annotation_name and annotation_xml_obj:
        # # if annotation_name and (annotation_xml_obj:= self.get_xml_obj(annotation_name[0])):
        #     # todo 注释解析
        #
        #     # annotation_info = AnnotationFileParser(annotation_xml_obj)()
        #     annotation_info = AnnotationFileParser(annotation_xml_obj)()
        #     # logger.debug(f"annotation_info is {annotation_info}")


        # Page content — there may be several pages.
        page_name: list = doc_root_info.get("page")
        page_info_d = {}
        if page_name:
            for index, _page in enumerate(page_name):
                page_xml_obj = self.get_xml_obj(_page)
                # Re-read the page size from the page's own PhysicalBox.
                try:
                    page_size = [float(pos_i) for pos_i in
                                     page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox",
                                                                                              "").split(" ")
                                     if re.match("[\d\.]", pos_i)]
                    if page_size and len(page_size) >= 2:
                        page_size_details.append(page_size)
                    else:
                        if doc_page_size:
                            page_size_details.append(doc_page_size)
                        else:
                            page_size_details.append([])
                except Exception as e:
                    traceback.print_exc()
                    # NOTE(review): likely meant page_size_details.append([]);
                    # 'page_size' may still be unbound here (NameError) when
                    # the comprehension itself raised — confirm and fix.
                    page_size.append([])
                page_info = ContentFileParser(page_xml_obj)()
                # Page number from the file name, falling back to list order.
                pg_no = re.search(r"\d+", _page)
                if pg_no:
                    pg_no = int(pg_no.group())
                else:
                    pg_no = index
                page_info_d[pg_no] = page_info
                # 只跑一页
                # print('odf_parser parser() 只跑一页')
                # break

        # Annotation pages extracted as regular content.
        annot_page_info_d = {}
        annot_page_name: list = doc_root_info.get("annot_page")
        if annot_page_name:
            for index, _page in enumerate(annot_page_name):
                annot_page_xml_obj = self.get_xml_obj(_page)
                annot_page_info = ContentFileParser(annot_page_xml_obj)()
                pg_no = re.search(r"\d+", _page)
                if pg_no:
                    pg_no = int(pg_no.group())
                else:
                    pg_no = index

                # Re-read the annotation page size (result currently unused).
                # try:
                #     page_size = [float(pos_i) for pos_i in
                #                  annot_page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox",
                #                                                                           "").split(" ")
                #                  if re.match("[\d\.]", pos_i)]
                #     if page_size and len(page_size) >= 2:
                #         # page_size_details.append(page_size)
                #         pass
                #     else:
                #         page_size = []
                # except Exception as e:
                #     traceback.print_exc()
                #     page_size.append([])
                page_size = self.get_page_size(annot_page_xml_obj)
                # if not page_size:
                #     page_size = doc_page_size

                # annot_page_info['annot_page_size'] = page_size
                annot_page_info_d[pg_no] = annot_page_info
                # 只跑一页
                # print('odf_parser parser() 只跑一页')
                # break
        # Merge annotation text into the matching content page.
        for page_id, page_d in page_info_d.items():
            if page_id not in annot_page_info_d.keys():
                continue
            annot_page_d = annot_page_info_d.get(page_id)
            # print("annot_page_d.get('text_list')", annot_page_d.get('text_list'))
            page_d['text_list'] += annot_page_d.get('text_list')
            page_d['annot_text_list'] = annot_page_d.get('text_list')
            # page_d['annot_page_size'] = annot_page_d.get('annot_page_size')
        # print('page_info_d', page_info_d)
        # print('annot_page_info_d', annot_page_info_d)

        # Template (tpl) info — merging currently disabled, kept for reference.
        tpls_name: list = doc_root_info.get("tpls")
        # if tpls_name:
        #     for index, _tpl in enumerate(tpls_name):
        #         tpl_xml_obj = self.get_xml_obj(_tpl)
        #         tpl_info = ContentFileParser(tpl_xml_obj)()
        #         tpl_no = re.search(r"\d+", _tpl)
        #
        #         if tpl_no:
        #             tpl_no = int(tpl_no.group())
        #         else:
        #             tpl_no = index
        #
        #         if tpl_no in page_info_d:
        #             page_info_d[pg_no]["text_list"].extend(tpl_info["text_list"])
        #             page_info_d[pg_no]["text_list"].sort(
        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
        #             page_info_d[pg_no]["img_list"].extend(tpl_info["img_list"])
        #             page_info_d[pg_no]["img_list"].sort(
        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
        #             page_info_d[pg_no]["line_list"].extend(tpl_info["line_list"])
        #             page_info_d[pg_no]["line_list"].sort(
        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
        #         else:
        #             page_info_d[tpl_no] = tpl_info
        #             page_info_d[tpl_no].sort(
        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))

        # todo read annotation info
        page_ID = 0  # a multi-doc OFD has not been encountered yet
        # print("page_info",len(page_info))
        # NOTE: page_info / page_tpl_info / page_content_info all reference
        # the same page_info_d object.
        doc_list.append({
            "default_page_size": default_page_size,
            "page_size": page_size_details,
            "pdf_name": self.file_tree["pdf_name"],
            "doc_no": page_ID,
            "images": img_info,
            "signatures_page_id": signatures_page_id,
            "page_id_map": page_id_map,
            "fonts": font_info,
            "page_info": page_info_d,
            "page_tpl_info": page_info_d,
            "page_content_info": page_info_d,
            # "annot_page_info": annot_page_info_d,
        })
        return doc_list
+
+    def get_page_size(self, page_xml_obj):
+        try:
+            page_size = [float(pos_i) for pos_i in page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox", "").split(" ")if re.match("[\d\.]", pos_i)]
+            if not (page_size and len(page_size) >= 2):
+                page_size = [float(pos_i) for pos_i in page_xml_obj.get('ofd:Document', {}).get('ofd:CommonData', {}).get("ofd:PageArea", {}).get("ofd:PhysicalBox", "").split(" ")if re.match("[\d\.]", pos_i)]
+                if not (page_size and len(page_size) >= 2):
+                    page_size = []
+        except Exception as e:
+            traceback.print_exc()
+            page_size = []
+        return page_size
+
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        """
+        输出ofd解析结果
+        """
+        save_xml = kwargs.get("save_xml", False)
+        xml_name = kwargs.get("xml_name")
+        save_dir = kwargs.get("save_dir")
+        self.file_tree = FileRead(self.ofdb64)(save_xml=save_xml, xml_name=xml_name, save_dir=save_dir)
+        # logger.info(self.file_tree)
+        return self.parser(save_dir)
+
+
if __name__ == "__main__":
    # Ad-hoc manual check: parse a local OFD file and dump each document dict.
    p = "C:/Users/Administrator/Downloads/1750060386706.ofd"
    with open(p, "rb") as f:
        ofdb64 = str(base64.b64encode(f.read()), "utf-8")
    obj_list = OFDParser(ofdb64)()
    for obj in obj_list:
        print('obj', obj)

+ 31 - 0
format_convert/easyofd/easyofd/parser_ofd/parameter_parser.py

@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd
+# CREATE_TIME: 
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note:参数解析器
+from loguru import logger
+from typing import List, Dict, Any, Union, Tuple, Optional
+
+
class ParameterParser(object):
    """Validate parsed OFD parameter values against expected types.

    ``parameter`` maps a parameter key to a pair
    ``(accepted_types, default_factory)``: a looked-up value is returned
    as-is when it is an instance of ``accepted_types``; otherwise a fresh
    default (``default_factory()``) is returned instead.
    """

    parameter = {
        "ofd:FillColor": (dict, dict),
        "ofd:StrokeColor": (dict, dict),
        "ofd:Test": ((str, int), str),
        "ofd:Font": (str, str),
        "@Value": (str, str)
    }

    def __call__(self, key, container):
        """Fetch ``container[key]`` coerced to the declared type, or None for unknown keys."""
        spec = ParameterParser.parameter.get(key)
        if spec is None:
            # Unknown key: report it and yield nothing.
            logger.warning(f"{key} not in ParameterParser")
            return None
        accepted, default_factory = spec
        value = container.get(key, None)
        return value if isinstance(value, accepted) else default_factory()

+ 61 - 0
format_convert/easyofd/easyofd/parser_ofd/path_parser.py

@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  path_parser.py
+# CREATE_TIME: 2025/4/9 16:31
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:
+from enum import Enum
+import os
+
class PathType(Enum):
    absolutely = 1
    relative = 2


class PathParser:
    """Resolve OFD-internal resource locations to concrete file paths.

    Accepts locations written in any of these forms and joins them
    against the current file's location:
    "/ROOT/a.xml", "./ROOT/a.xml", "../ROOT/a.xml", "ROOT/a.xml".
    """

    def __init__(self, root_path: str):
        # Remember which separator convention this platform uses.
        self.os = "nt" if os.name == 'nt' else "posix"
        self.root_path = self.format_path(root_path)

    def format_path(self, path: str):
        """Normalize *path* and force the platform's separator style."""
        clean = os.path.normpath(path)
        sep_from, sep_to = ("/", "\\") if self.os == "nt" else ("\\", "/")
        return clean.replace(sep_from, sep_to)

    def get_path_type(self, path: str):
        """Classify *path* as absolute or relative."""
        return PathType.absolutely if os.path.isabs(path) else PathType.relative

    def __call__(self, cur_path: str, loc_path: str):
        """Resolve *loc_path* (posix style) against *cur_path*.

        NOTE(review): a bare relative path ("ROOT/a.xml") is joined to the
        parent directory of *cur_path*, while "./ROOT/a.xml" is joined to
        *cur_path* itself — confirm this asymmetry is intended.
        """
        if self.get_path_type(loc_path) is PathType.absolutely:
            return self.format_path(loc_path)
        if loc_path.startswith("./"):
            return os.path.join(cur_path, self.format_path(loc_path[2:]))
        if loc_path.startswith("../"):
            return os.path.join(os.path.dirname(cur_path), self.format_path(loc_path[3:]))
        return os.path.join(os.path.dirname(cur_path), self.format_path(loc_path))

+ 7 - 0
format_convert/easyofd/easyofd/template_ofd/__init__.py

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  __init__.py.py
+# CREATE_TIME: 2025/3/28 15:43
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:

+ 53 - 0
format_convert/font_map/extend_to_normal_dict.txt

@@ -0,0 +1,53 @@
+{
+ "⺁":"厂",
+ "⺇":"几",
+ "⺌":"小",
+ "⺎":"兀",
+ "⺏":"尣",
+ "⺐":"尢",
+ "⺑":"𡯂",
+ "⺒":"巳",
+ "⺓":"幺",
+ "⺛":"旡",
+ "⺝":"月",
+ "⺟":"母",
+ "⺠":"民",
+ "⺱":"冈",
+ "⺸":"芈",
+ "⻁":"虎",
+ "⻄":"西",
+ "⻅":"见",
+ "⻆":"角",
+ "⻇":"𧢲",
+ "⻉":"贝",
+ "⻋":"车",
+ "⻒":"镸",
+ "⻓":"长",
+ "⻔":"门",
+ "⻗":"雨",
+ "⻘":"青",
+ "⻙":"韦",
+ "⻚":"页",
+ "⻛":"风",
+ "⻜":"飞",
+ "⻝":"食",
+ "⻡":"𩠐",
+ "⻢":"马",
+ "⻣":"骨",
+ "⻤":"鬼",
+ "⻥":"鱼",
+ "⻦":"鸟",
+ "⻧":"卤",
+ "⻨":"麦",
+ "⻩":"黄",
+ "⻬":"齐",
+ "⻮":"齿",
+ "⻯":"竜",
+ "⻰":"龙",
+ "⻳":"龟",
+ "⾅":"臼",
+ "⼝":"口",
+ "⼾":"户",
+ "⼉":"儿",
+ "⼱":"巾"
+}

+ 214 - 0
format_convert/font_map/kangxi_to_normal

@@ -0,0 +1,214 @@
+⼀ 2F00 一 4E00
+⼁ 2F01 丨 4E28
+⼂ 2F02 丶 4E36
+⼃ 2F03 丿 4E3F
+⼄ 2F04 乙 4E59
+⼅ 2F05 亅 4E85
+⼆ 2F06 二 4E8C
+⼇ 2F07 亠 4EA0
+⼈ 2F08 人 4EBA
+⼉ 2F09 儿 513F
+⼊ 2F0A 入 5165
+⼋ 2F0B 八 516B
+⼌ 2F0C 冂 5182
+⼍ 2F0D 冖 5196
+⼎ 2F0E 冫 51AB 
+⼏ 2F0F 几 51E0
+⼐ 2F10 凵 51F5
+⼑ 2F11 刀 5200
+⼒ 2F12 力 529B
+⼓ 2F13 勹 52F9
+⼔ 2F14 匕 5315 
+⼕ 2F15 匚 531A 
+⼖ 2F16 匸 5338 
+⼗ 2F17 十 5341
+⼘ 2F18 卜 535C
+⼙ 2F19 卩 5369
+⼚ 2F1A 厂 5382
+⼛ 2F1B 厶 53B6
+⼜ 2F1C 又 53C8
+⼝ 2F1D 口 53E3
+⼞ 2F1E 囗 56D7
+⼟ 2F1F 土 571F
+⼠ 2F20 士 58EB
+⼡ 2F21 夂 5902
+⼢ 2F22 夊 590A
+⼣ 2F23 夕 5915
+⼤ 2F24 大 5927
+⼥ 2F25 女 5973
+⼦ 2F26 子 5B50
+⼧ 2F27 宀 5B80
+⼨ 2F28 寸 5BF8
+⼩ 2F29 小 5C0F
+⼪ 2F2A 尢 5C22
+⼫ 2F2B 尸 5C38
+⼬ 2F2C 屮 5C6E
+⼭ 2F2D 山 5C71
+⼮ 2F2E 巛 5DDB
+⼯ 2F2F 工 5DE5
+⼰ 2F30 己 5DF1
+⼱ 2F31 巾 5DFE
+⼲ 2F32 干 5E72
+⼳ 2F33 幺 5E7A
+⼴ 2F34 广 5E7F
+⼵ 2F35 廴 5EF4
+⼶ 2F36 廾 5EFE
+⼷ 2F37 弋 5F0B
+⼸ 2F38 弓 5F13
+⼹ 2F39 彐 5F50
+⼺ 2F3A 彡 5F61
+⼻ 2F3B 彳 5F73
+⼼ 2F3C 心 5FC3
+⼽ 2F3D 戈 6208
+⼾ 2F3E 戶 6236
+⼿ 2F3F 手 624B
+⽀ 2F40 支 652F
+⽁ 2F41 攴 6534
+⽂ 2F42 文 6587
+⽃ 2F43 斗 6597
+⽄ 2F44 斤 65A4
+⽅ 2F45 方 65B9
+⽆ 2F46 无 65E0
+⽇ 2F47 日 65E5
+⽈ 2F48 曰 66F0
+⽉ 2F49 月 6708
+⽊ 2F4A 木 6728
+⽋ 2F4B 欠 6B20
+⽌ 2F4C 止 6B62
+⽍ 2F4D 歹 6B79
+⽎ 2F4E 殳 6BB3
+⽏ 2F4F 毋 6BCB
+⽐ 2F50 比 6BD4
+⽑ 2F51 毛 6BDB
+⽒ 2F52 氏 6C0F
+⽓ 2F53 气 6C14
+⽔ 2F54 水 6C34
+⽕ 2F55 火 706B
+⽖ 2F56 爪 722A
+⽗ 2F57 父 7236
+⽘ 2F58 爻 723B
+⽙ 2F59 爿 723F
+⽚ 2F5A 片 7247
+⽛ 2F5B 牙 7259
+⽜ 2F5C 牛 725B
+⽝ 2F5D 犬 72AC
+⽞ 2F5E 玄 7384
+⽟ 2F5F 玉 7389
+⽠ 2F60 瓜 74DC
+⽡ 2F61 瓦 74E6
+⽢ 2F62 甘 7518
+⽣ 2F63 生 751F
+⽤ 2F64 用 7528
+⽥ 2F65 田 7530
+⽦ 2F66 疋 758B
+⽧ 2F67 疒 7592
+⽨ 2F68 癶 7676
+⽩ 2F69 白 767D
+⽪ 2F6A 皮 76AE
+⽫ 2F6B 皿 76BF
+⽬ 2F6C 目 76EE
+⽭ 2F6D 矛 77DB
+⽮ 2F6E 矢 77E2
+⽯ 2F6F 石 77F3
+⽰ 2F70 示 793A
+⽱ 2F71 禸 79B8
+⽲ 2F72 禾 79BE
+⽳ 2F73 穴 7A74
+⽴ 2F74 立 7ACB
+⽵ 2F75 竹 7AF9
+⽶ 2F76 米 7C73
+⽷ 2F77 糸 7CF8
+⽸ 2F78 缶 7F36
+⽹ 2F79 网 7F51
+⽺ 2F7A 羊 7F8A
+⽻ 2F7B 羽 7FBD
+⽼ 2F7C 老 8001
+⽽ 2F7D 而 800C
+⽾ 2F7E 耒 8012
+⽿ 2F7F 耳 8033
+⾀ 2F80 聿 807F
+⾁ 2F81 肉 8089
+⾂ 2F82 臣 81E3
+⾃ 2F83 自 81EA
+⾄ 2F84 至 81F3
+⾅ 2F85 臼 81FC
+⾆ 2F86 舌 820C
+⾇ 2F87 舛 821B
+⾈ 2F88 舟 821F
+⾉ 2F89 艮 826E
+⾊ 2F8A 色 8272
+⾋ 2F8B 艸 8278
+⾌ 2F8C 虍 864D
+⾍ 2F8D 虫 866B
+⾎ 2F8E 血 8840
+⾏ 2F8F 行 884C
+⾐ 2F90 衣 8863
+⾑ 2F91 襾 897E
+⾒ 2F92 見 898B
+⾓ 2F93 角 89D2
+⾔ 2F94 言 8A00
+⾕ 2F95 谷 8C37
+⾖ 2F96 豆 8C46
+⾗ 2F97 豕 8C55
+⾘ 2F98 豸 8C78
+⾙ 2F99 貝 8C9D
+⾚ 2F9A 赤 8D64
+⾛ 2F9B 走 8D70
+⾜ 2F9C 足 8DB3
+⾝ 2F9D 身 8EAB
+⾞ 2F9E 車 8ECA
+⾟ 2F9F 辛 8F9B
+⾠ 2FA0 辰 8FB0
+⾡ 2FA1 辵 8FB5
+⾢ 2FA2 邑 9091
+⾣ 2FA3 酉 9149
+⾤ 2FA4 采 91C7
+⾥ 2FA5 里 91CC
+⾦ 2FA6 金 91D1
+⾧ 2FA7 長 9577
+⾨ 2FA8 門 9580
+⾩ 2FA9 阜 961C
+⾪ 2FAA 隶 96B6
+⾫ 2FAB 隹 96B9
+⾬ 2FAC 雨 96E8
+⾭ 2FAD 青 9752
+⾮ 2FAE 非 975E
+⾯ 2FAF 面 9762
+⾰ 2FB0 革 9769
+⾱ 2FB1 韋 97CB
+⾲ 2FB2 韭 97ED
+⾳ 2FB3 音 97F3
+⾴ 2FB4 頁 9801
+⾵ 2FB5 風 98A8
+⾶ 2FB6 飛 98DB
+⾷ 2FB7 食 98DF
+⾸ 2FB8 首 9996
+⾹ 2FB9 香 9999
+⾺ 2FBA 馬 99AC
+⾻ 2FBB 骨 9AA8
+⾼ 2FBC 高 9AD8
+⾽ 2FBD 髟 9ADF
+⾾ 2FBE 鬥 9B25
+⾿ 2FBF 鬯 9B2F
+⿀ 2FC0 鬲 9B32
+⿁ 2FC1 鬼 9B3C
+⿂ 2FC2 魚 9B5A
+⿃ 2FC3 鳥 9CE5
+⿄ 2FC4 鹵 9E75
+⿅ 2FC5 鹿 9E7F
+⿆ 2FC6 麥 9EA5
+⿇ 2FC7 麻 9EBB
+⿈ 2FC8 黃 9EC3
+⿉ 2FC9 黍 9ECD
+⿊ 2FCA 黑 9ED1
+⿋ 2FCB 黹 9EF9
+⿌ 2FCC 黽 9EFD
+⿍ 2FCD 鼎 9F0E
+⿎ 2FCE 鼓 9F13
+⿏ 2FCF 鼠 9F20
+⿐ 2FD0 鼻 9F3B
+⿑ 2FD1 齊 9F4A
+⿒ 2FD2 齒 9F52
+⿓ 2FD3 龍 9F8D
+⿔ 2FD4 龜 9F9C
+⿕ 2FD5 龠 9FA0

+ 154 - 0
format_convert/font_map/kangxi_to_normal_dict.txt

@@ -0,0 +1,154 @@
+{
+    "⼀": "一",
+    "⼄": "乙",
+    "⼆": "二",
+    "⼈": "人",
+    "⼉": "儿",
+    "⼊": "入",
+    "⼋": "八",
+    "⼏": "几",
+    "⼑": "刀",
+    "⼒": "力",
+    "⼔": "匕",
+    "⼗": "十",
+    "⼘": "卜",
+    "⼚": "厂",
+    "⼜": "又",
+    "⼝": "口",
+    "⼞": "口",
+    "⼟": "土",
+    "⼠": "士",
+    "⼤": "大",
+    "⼥": "女",
+    "⼦": "子",
+    "⼨": "寸",
+    "⼩": "小",
+    "⼫": "尸",
+    "⼭": "山",
+    "⼯": "工",
+    "⼰": "己",
+    "⼲": "干",
+    "⼴": "广",
+    "⼸": "弓",
+    "⼼": "心",
+    "⼽": "戈",
+    "⼿": "手",
+    "⽀": "支",
+    "⽂": "文",
+    "⽃": "斗",
+    "⽄": "斤",
+    "⽅": "方",
+    "⽆": "无",
+    "⽇": "日",
+    "⽈": "曰",
+    "⽉": "月",
+    "⽊": "木",
+    "⽋": "欠",
+    "⽌": "止",
+    "⽍": "歹",
+    "⽏": "毋",
+    "⽐": "比",
+    "⽑": "毛",
+    "⽒": "氏",
+    "⽓": "气",
+    "⽔": "水",
+    "⽕": "火",
+    "⽖": "爪",
+    "⽗": "父",
+    "⽚": "片",
+    "⽛": "牙",
+    "⽜": "牛",
+    "⽝": "犬",
+    "⽞": "玄",
+    "⽟": "玉",
+    "⽠": "瓜",
+    "⽡": "瓦",
+    "⽢": "甘",
+    "⽣": "生",
+    "⽤": "用",
+    "⽥": "田",
+    "⽩": "白",
+    "⽪": "皮",
+    "⽫": "皿",
+    "⽬": "目",
+    "⽭": "矛",
+    "⽮": "矢",
+    "⽯": "石",
+    "⽰": "示",
+    "⽲": "禾",
+    "⽳": "穴",
+    "⽴": "立",
+    "⽵": "竹",
+    "⽶": "米",
+    "⽸": "缶",
+    "⽹": "网",
+    "⽺": "羊",
+    "⽻": "羽",
+    "⽼": "老",
+    "⽽": "而",
+    "⽿": "耳",
+    "⾁": "肉",
+    "⾂": "臣",
+    "⾃": "自",
+    "⾄": "至",
+    "⾆": "舌",
+    "⾈": "舟",
+    "⾉": "艮",
+    "⾊": "色",
+    "⾍": "虫",
+    "⾎": "血",
+    "⾏": "行",
+    "⾐": "衣",
+    "⾒": "见",
+    "⾓": "角",
+    "⾔": "言",
+    "⾕": "谷",
+    "⾖": "豆",
+    "⾚": "赤",
+    "⾛": "走",
+    "⾜": "足",
+    "⾝": "身",
+    "⾞": "车",
+    "⾟": "辛",
+    "⾠": "辰",
+    "⾢": "邑",
+    "⾣": "酉",
+    "⾤": "采",
+    "⾥": "里",
+    "⾦": "金",
+    "⾧": "长",
+    "⾨": "门",
+    "⾩": "阜",
+    "⾪": "隶",
+    "⾬": "雨",
+    "⾭": "青",
+    "⾮": "非",
+    "⾯": "面",
+    "⾰": "革",
+    "⾲": "韭",
+    "⾳": "音",
+    "⾴": "页",
+    "⾵": "风",
+    "⾶": "飞",
+    "⾷": "食",
+    "⾸": "首",
+    "⾹": "香",
+    "⾺": "马",
+    "⾻": "骨",
+    "⾼": "高",
+    "⿁": "鬼",
+    "⿂": "鱼",
+    "⿃": "鸟",
+    "⿄": "卤",
+    "⿅": "鹿",
+    "⿇": "麻",
+    "⿉": "黍",
+    "⿊": "黑",
+    "⿍": "鼎",
+    "⿎": "鼓",
+    "⿏": "鼠",
+    "⿐": "鼻",
+    "⿒": "齿",
+    "⿓": "龙",
+    "⼣": "夕"
+}

+ 327 - 0
format_convert/ofd/ofd_parser.py

@@ -0,0 +1,327 @@
+import os
+import zipfile
+import xml.etree.ElementTree as ET
+from typing import Dict, List, Any, Optional
+from pathlib import Path
+
+
class OFDParser:
    """OFD文件解析器 — parse an OFD (Open Fixed-layout Document) container.

    An OFD file is a Zip archive of XML parts. ``parse()`` unzips it to a
    temporary directory, reads ``OFD.xml`` for container-level info, then
    every ``Document.xml`` for metadata, fonts and page content, and
    returns the whole structure as plain dicts/lists.
    """

    def __init__(self, ofd_path: str):
        """Validate *ofd_path* and prepare parser state.

        Raises:
            FileNotFoundError: the path does not exist.
            ValueError: the file is not a Zip archive (OFD is Zip-based).
        """
        self.ofd_path = ofd_path
        # NOTE(review): fixed relative temp dir — concurrent parses in the
        # same CWD would collide; confirm single-process usage.
        self.temp_dir = Path("./ofd_temp")
        self.ofd_info = {}
        self.documents = []

        if not os.path.exists(ofd_path):
            raise FileNotFoundError(f"OFD文件不存在: {ofd_path}")

        if not zipfile.is_zipfile(ofd_path):
            raise ValueError(f"文件不是有效的OFD文件(Zip格式): {ofd_path}")

    @staticmethod
    def _parse_box(box_attr: Optional[str]) -> Dict[str, float]:
        """Parse an 'x y width height' attribute string into a dict.

        Returns zeroed values when the attribute is missing or malformed.
        The previous inline ``attr.split()`` calls raised AttributeError
        when the XML attribute was absent and re-split the string four
        times per object.
        """
        parts = box_attr.split() if box_attr else []
        if len(parts) >= 4:
            try:
                return {
                    'x': float(parts[0]),
                    'y': float(parts[1]),
                    'width': float(parts[2]),
                    'height': float(parts[3]),
                }
            except ValueError:
                pass
        return {'x': 0.0, 'y': 0.0, 'width': 0.0, 'height': 0.0}

    def parse(self) -> Dict[str, Any]:
        """解析OFD文件并返回内容结构 — run the full parsing pipeline.

        Returns:
            dict with ``file_info`` (container info) and ``documents``
            (one entry per Document.xml found in the archive).
        """
        try:
            self._extract_ofd()
            self._parse_ofd_xml()
            self._parse_documents()
            return {
                "file_info": self.ofd_info,
                "documents": self.documents
            }
        finally:
            self._cleanup()

    def _extract_ofd(self) -> None:
        """解压OFD文件到临时目录 — unzip the archive into ``self.temp_dir``."""
        self.temp_dir.mkdir(exist_ok=True)
        with zipfile.ZipFile(self.ofd_path, 'r') as zip_ref:
            zip_ref.extractall(self.temp_dir)

    def _parse_ofd_xml(self) -> None:
        """解析OFD.xml文件获取基本信息 — read container-level info into ``self.ofd_info``."""
        ofd_xml_path = self.temp_dir / "OFD.xml"
        if not ofd_xml_path.exists():
            raise ValueError("OFD.xml文件缺失")

        root = ET.parse(ofd_xml_path).getroot()
        namespace = {'ofd': 'http://www.ofdspec.org/2016'}

        # Container-level document pointers.
        doc_body = root.find('ofd:DocBody', namespace)
        if doc_body is not None:
            # Reference to the Document.xml of this DocBody.
            doc_file = doc_body.find('ofd:DocFile', namespace)
            if doc_file is not None:
                self.ofd_info['doc_file'] = doc_file.text

            # Signature summary, when present.
            signatures = doc_body.find('ofd:Signatures', namespace)
            if signatures is not None:
                self.ofd_info['signatures'] = {
                    'file': signatures.get('FileRef'),
                    'count': int(signatures.get('Count', 0))
                }

    def _parse_documents(self) -> None:
        """解析文档内容 — parse every Document.xml found in the archive."""
        doc_xml_files = list(self.temp_dir.rglob("Document.xml"))
        for doc_xml in doc_xml_files:
            self.documents.append(self._parse_document(doc_xml))

    def _parse_document(self, doc_xml_path: Path) -> Dict[str, Any]:
        """解析单个文档 — parse one Document.xml into fonts/metadata/pages."""
        namespace = {'ofd': 'http://www.ofdspec.org/2016'}
        root = ET.parse(doc_xml_path).getroot()

        document = {
            'path': str(doc_xml_path),
            'pages': [],
            'fonts': self._parse_fonts(root, namespace),
            'metadata': self._parse_metadata(root, namespace)
        }

        # Resolve each page reference and parse the page file if it exists.
        pages_node = root.find('.//ofd:Pages', namespace)
        if pages_node is not None:
            for page_ref in pages_node.findall('ofd:Page', namespace):
                page_id = page_ref.get('ID')
                page_file = page_ref.find('ofd:PageFile', namespace)
                if page_file is not None:
                    page_path = self.temp_dir / page_file.text
                    if page_path.exists():
                        document['pages'].append({
                            'id': page_id,
                            'content': self._parse_page(page_path)
                        })

        return document

    def _parse_fonts(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, str]]:
        """解析文档字体信息 — collect the document's font declarations."""
        fonts = []
        font_list = root.find('.//ofd:Fonts', ns)
        if font_list is not None:
            for font_node in font_list.findall('ofd:Font', ns):
                fonts.append({
                    'id': font_node.get('ID'),
                    'name': font_node.get('FontName'),
                    'family': font_node.get('FamilyName'),
                    'format': font_node.get('FontFormat'),
                    'bold': font_node.get('Bold') == 'true',
                    'italic': font_node.get('Italic') == 'true',
                    'serif': font_node.get('Serif') == 'true',
                    'fixed_width': font_node.get('FixedWidth') == 'true'
                })
        return fonts

    def _parse_metadata(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, str]:
        """解析文档元数据 — collect DocInfo metadata fields that have text."""
        metadata = {}
        doc_info = root.find('.//ofd:DocInfo', ns)
        if doc_info is not None:
            for attr in ['Title', 'Author', 'Subject', 'Keywords', 'Creator',
                         'CreatorVersion', 'CreationDate', 'ModDate']:
                element = doc_info.find(f'ofd:{attr}', ns)
                if element is not None and element.text:
                    metadata[attr] = element.text
        return metadata

    def _parse_page(self, page_path: Path) -> Dict[str, Any]:
        """解析页面内容 — parse one page XML into size/text/images/graphics/layers."""
        # All prefixes point at the same OFD namespace; the aliases only
        # make the querying code below read clearer.
        namespace = {
            'ofd': 'http://www.ofdspec.org/2016',
            'ofdtext': 'http://www.ofdspec.org/2016',
            'ofdgraph': 'http://www.ofdspec.org/2016',
            'ofdimg': 'http://www.ofdspec.org/2016'
        }
        root = ET.parse(page_path).getroot()

        return {
            'size': self._parse_page_size(root, namespace),
            'text_content': self._extract_text_content(root, namespace),
            'images': self._extract_images(root, namespace),
            'graphics': self._extract_graphics(root, namespace),
            'layers': self._parse_layers(root, namespace)
        }

    def _parse_page_size(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, float]:
        """解析页面尺寸 — read the PhysicalBox dimensions of a page.

        NOTE(review): reads Width/Height and lowercase x/y attributes;
        OFD files that encode PhysicalBox as a space-separated value
        string would not populate these — confirm against real samples.
        """
        box = root.find('.//ofd:Area/ofd:PhysicalBox', ns)
        if box is not None:
            return {
                'width': float(box.get('Width', 0)),
                'height': float(box.get('Height', 0)),
                'x': float(box.get('x', 0)),
                'y': float(box.get('y', 0))
            }
        return {'width': 0, 'height': 0, 'x': 0, 'y': 0}

    def _extract_text_content(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """提取页面文本内容 — extract text objects with position and style.

        Only objects that contain at least one non-empty TextCode are kept.
        """
        texts = []
        for text_obj in root.findall('.//ofdtext:TextObject', ns):
            text_info = {
                'id': text_obj.get('ID'),
                'bounding_box': self._parse_box(text_obj.get('BoundaryBox')),
                'transform': text_obj.get('CTM'),
                'content': []
            }

            # Optional style block on the text object.
            style = text_obj.find('ofdtext:TextStyle', ns)
            if style is not None:
                text_info['style'] = {
                    'font': style.get('Font'),
                    'size': float(style.get('Size', 0)),
                    'color': style.get('FillColor'),
                    'weight': style.get('Weight'),
                    'italic': style.get('Italic') == 'true',
                    'underline': style.get('Underline') == 'true',
                    'strikeout': style.get('StrikeOut') == 'true'
                }

            # The actual glyph runs with their anchor positions.
            for codec in text_obj.findall('ofdtext:TextCode', ns):
                if codec.text:
                    text_info['content'].append({
                        'text': codec.text.strip(),
                        'position': {
                            'x': float(codec.get('X', 0)),
                            'y': float(codec.get('Y', 0))
                        }
                    })

            if text_info['content']:
                texts.append(text_info)

        return texts

    def _extract_images(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """提取页面中的图像信息 — extract image placements (not pixel data)."""
        images = []
        for img_obj in root.findall('.//ofdimg:ImageObject', ns):
            images.append({
                'id': img_obj.get('ID'),
                'bounding_box': self._parse_box(img_obj.get('BoundaryBox')),
                'resource_id': img_obj.get('ResourceID'),
                'transform': img_obj.get('CTM')
            })
        return images

    def _extract_graphics(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """提取页面中的图形信息 — extract vector path objects."""
        graphics = []
        for graphic_obj in root.findall('.//ofdgraph:PathObject', ns):
            # Look up PathData once instead of twice.
            path_node = graphic_obj.find('ofdgraph:PathData', ns)
            graphics.append({
                'id': graphic_obj.get('ID'),
                'bounding_box': self._parse_box(graphic_obj.get('BoundaryBox')),
                'fill_color': graphic_obj.get('FillColor'),
                'stroke_color': graphic_obj.get('StrokeColor'),
                'line_width': float(graphic_obj.get('LineWidth', 0)),
                'path_data': path_node.text if path_node is not None else ''
            })
        return graphics

    def _parse_layers(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """解析页面图层信息 — count object kinds per layer."""
        layers = []
        for layer in root.findall('.//ofd:Layer', ns):
            layers.append({
                'type': layer.get('Type'),
                'objects': {
                    'text': len(layer.findall('.//ofdtext:TextObject', ns)),
                    'images': len(layer.findall('.//ofdimg:ImageObject', ns)),
                    'graphics': len(layer.findall('.//ofdgraph:PathObject', ns))
                }
            })
        return layers

    def _cleanup(self) -> None:
        """清理临时文件 — temp-dir cleanup hook.

        Deliberately a no-op for now: deletion is disabled so the
        extracted XML can be inspected after parsing. Re-enable the
        lines below once debugging is done.
        """
        # import shutil
        # if self.temp_dir.exists():
        #     shutil.rmtree(self.temp_dir)
+
+
# Usage example: parse a local OFD file and print a content summary.
if __name__ == "__main__":
    try:
        p = "C:/Users/Administrator/Downloads/1750060386706.ofd"
        parser = OFDParser(p)
        result = parser.parse()

        # Print basic container-level info
        print("文档信息:", result["file_info"])

        # Walk every parsed document and summarize its contents
        for doc_idx, document in enumerate(result["documents"], 1):
            print(f"\n文档 {doc_idx}:")
            print(f"  字体数量: {len(document['fonts'])}")
            print(f"  页面数量: {len(document['pages'])}")

            # Print document metadata, when present
            if document['metadata']:
                print("  元数据:")
                for key, value in document['metadata'].items():
                    print(f"    {key}: {value}")

            # Per-page content summary
            for page_idx, page in enumerate(document["pages"], 1):
                print(f"\n  页面 {page_idx}:")
                print(f"    尺寸: {page['content']['size']['width']} x {page['content']['size']['height']}")
                print(f"    文本元素: {len(page['content']['text_content'])}")
                print(f"    图像元素: {len(page['content']['images'])}")
                print(f"    图形元素: {len(page['content']['graphics'])}")
                print(f"    图层数量: {len(page['content']['layers'])}")

                # Show the first 5 text elements, truncated to 50 chars
                if page['content']['text_content']:
                    print("    前5行文本:")
                    for i, text_elem in enumerate(page['content']['text_content'][:5]):
                        text_lines = " ".join([t['text'] for t in text_elem['content']])
                        print(f"      {i + 1}. {text_lines[:50]}{'...' if len(text_lines) > 50 else ''}")

    except Exception as e:
        print(f"解析OFD文件时出错: {e}")

+ 320 - 12
format_convert/utils.py

@@ -9,13 +9,18 @@ import pickle
 import socket
 import socket
 import subprocess
 import subprocess
 import sys
 import sys
+from glob import glob
 from io import BytesIO
 from io import BytesIO
 from subprocess import Popen
 from subprocess import Popen
+import pynvml
+import datetime
+import PyPDF2
 from shapely.geometry import LineString
 from shapely.geometry import LineString
 import cv2
 import cv2
 import requests
 import requests
 from PIL import Image
 from PIL import Image
-
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.ttfonts import TTFont
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 import difflib
 import difflib
 import logging
 import logging
@@ -43,6 +48,14 @@ from shapely.geometry import Polygon
 
 
 config_file_path = os.path.dirname(os.path.abspath(__file__)) + "/../config/interface_new.yml"
 config_file_path = os.path.dirname(os.path.abspath(__file__)) + "/../config/interface_new.yml"
 
 
+# 特殊中文转基本中文
+with open(os.path.abspath(os.path.dirname(__file__)) + '/font_map/extend_to_normal_dict.txt', 'r', encoding='utf-8') as f:
+    extend_to_normal_dict = f.read()
+    extend_to_normal_dict = eval(extend_to_normal_dict)
+with open(os.path.abspath(os.path.dirname(__file__)) + '/font_map/kangxi_to_normal_dict.txt', 'r', encoding='utf-8') as f:
+    kangxi_to_normal_dict = f.read()
+    kangxi_to_normal_dict = eval(kangxi_to_normal_dict)
+
 
 
 def has_intersection(poly1, poly2):
 def has_intersection(poly1, poly2):
     """
     """
@@ -62,7 +75,7 @@ def has_intersection(poly1, poly2):
 
 
 
 
 def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13,
 def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13,
-                                  -14, -15, -16, -17, -18, -19, -20, -21, -22]):
+                                  -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]):
     """
     """
     [0] : continue
     [0] : continue
     [-1]: 逻辑处理错误
     [-1]: 逻辑处理错误
@@ -87,6 +100,7 @@ def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -1
     [-20]: requests请求超时
     [-20]: requests请求超时
     [-21]: requests请求返回错误状态码
     [-21]: requests请求返回错误状态码
     [-22]: requests请求拒绝连接
     [-22]: requests请求拒绝连接
+    [-23]: 两列无边框表格提取报错
     """
     """
     for c in code:
     for c in code:
         if isinstance(_list, list) and _list == [c]:
         if isinstance(_list, list) and _list == [c]:
@@ -366,11 +380,45 @@ def slash_replace(_str, reverse=False):
     return _str
     return _str
 
 
 
 
def align_table_lines(line_list, threshold=7):
    """Snap nearly-collinear table rules onto each other (mutates in place).

    Lines are split into horizontal and vertical groups by their dominant
    extent; within each sorted group, a line whose coordinate differs from
    its successor by at most *threshold* (but is not equal) is snapped to
    the successor's coordinate, so cell borders that cross merged cells
    stay aligned and table reconstruction does not misplace them.

    Returns the horizontal lines followed by the vertical ones, or the
    input list untouched when either group is empty.
    """
    horizontal, vertical = [], []
    for ln in line_list:
        x0, y0, x1, y1 = ln.bbox
        (horizontal if abs(x1 - x0) > abs(y1 - y0) else vertical).append(ln)

    if not horizontal or not vertical:
        return line_list

    # Align horizontal rules on their y coordinate.
    horizontal.sort(key=lambda l: (l.bbox[1], l.bbox[0]))
    for prev, cur in zip(horizontal, horizontal[1:]):
        gap = cur.bbox[1] - prev.bbox[1]
        if gap != 0 and abs(gap) <= threshold:
            prev.bbox = (prev.bbox[0], cur.bbox[1], prev.bbox[2], cur.bbox[3])

    # Align vertical rules on their x coordinate.
    vertical.sort(key=lambda l: (l.bbox[0], l.bbox[1]))
    for prev, cur in zip(vertical, vertical[1:]):
        gap = cur.bbox[0] - prev.bbox[0]
        if gap != 0 and abs(gap) <= threshold:
            prev.bbox = (cur.bbox[0], prev.bbox[1], cur.bbox[2], prev.bbox[3])

    return horizontal + vertical
+
+
 class LineTable:
 class LineTable:
     def recognize_table(self, list_textbox, list_line, sourceP_LB=False,
     def recognize_table(self, list_textbox, list_line, sourceP_LB=False,
                         splited=False, from_pdf=False, is_reverse=False, show=0):
                         splited=False, from_pdf=False, is_reverse=False, show=0):
         self.list_line = list_line
         self.list_line = list_line
-        self.list_crosspoints = self.recognize_crosspoints(list_line)
         self.from_pdf = from_pdf
         self.from_pdf = from_pdf
         self.splited = splited
         self.splited = splited
         self.connect_bbox_list = []
         self.connect_bbox_list = []
@@ -381,6 +429,13 @@ class LineTable:
             # 展示原始表格及文字
             # 展示原始表格及文字
             self._plot(list_line, list_textbox, title='list_line,list_textbox')
             self._plot(list_line, list_textbox, title='list_line,list_textbox')
 
 
+        list_line = align_table_lines(list_line)
+        if self.show:
+            self._plot(list_line, list_textbox, title='align_table_lines')
+
+        # 获取交点
+        self.list_crosspoints = self.recognize_crosspoints(list_line)
+
         # 聚类
         # 聚类
         cluster_crosspoints = []
         cluster_crosspoints = []
         for _point in self.list_crosspoints:
         for _point in self.list_crosspoints:
@@ -1189,6 +1244,15 @@ class LineTable:
 
 
     def fix_rect(self, _table, list_x, list_y, sourceP_LB, margin):
     def fix_rect(self, _table, list_x, list_y, sourceP_LB, margin):
         self.fix_span(_table, list_x, list_y, sourceP_LB)
         self.fix_span(_table, list_x, list_y, sourceP_LB)
+        if self.show:
+            # 打印_table
+            temp_list = []
+            for t in _table:
+                print('------ fix_span row ------')
+                for c in t:
+                    print('fix_span col', c)
+                    temp_list.append(c)
+            self._plot([], [], temp_list, title='fix_span table')
 
 
         for _line in _table:
         for _line in _table:
             _line.sort(key=lambda x: x.get('bbox')[0])
             _line.sort(key=lambda x: x.get('bbox')[0])
@@ -1646,7 +1710,7 @@ def sort_object(obj_list, is_reverse=False):
     if len(obj_list) == 0:
     if len(obj_list) == 0:
         return obj_list
         return obj_list
     if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
     if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
-        obj_list.sort(key=lambda x: (x.y, x.x), reverse=is_reverse)
+        obj_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]), reverse=is_reverse)
         return obj_list
         return obj_list
     elif isinstance(obj_list[0], _Page):
     elif isinstance(obj_list[0], _Page):
         obj_list.sort(key=lambda x: x.page_no)
         obj_list.sort(key=lambda x: x.page_no)
@@ -2544,6 +2608,237 @@ def dynamic_get_port(start_port, mode='-1', num=10):
     return None
     return None
 
 
 
 
def text_bbox_to_lt(text_list, bbox_list):
    """Pair OCR texts with their 4-corner bboxes and wrap them as TextBox objects.

    Each bbox is a list of 4 corner points; corner [0] (top-left) and corner [2]
    (bottom-right) supply the [x0, y0, x1, y1] rectangle TextBox expects.

    :param text_list: recognized text strings, parallel to bbox_list
    :param bbox_list: 4-point boxes, one per text
    :return: list of TextBox instances
    """
    from format_convert.convert_tree import TextBox
    lt_text_box_list = []
    for idx, box in enumerate(bbox_list):
        rect = [box[0][0], box[0][1], box[2][0], box[2][1]]
        lt_text_box_list.append(TextBox(rect, text_list[idx]))
    return lt_text_box_list
+
+
def extract_one_page_pdf(input_pdf_path, output_pdf_path, page_no):
    """Extract a single page (0-based page_no) from a PDF into a new file.

    Prints a status message on success, on an out-of-range page number, and on
    any other failure; never raises.

    :param input_pdf_path: source PDF path
    :param output_pdf_path: destination PDF path (overwritten)
    :param page_no: 0-based index of the page to extract
    """
    try:
        with open(input_pdf_path, 'rb') as src:
            reader = PyPDF2.PdfFileReader(src)

            # reject page numbers outside the document
            if not 0 <= page_no < len(reader.pages):
                print("页码超出范围")
                return

            writer = PyPDF2.PdfFileWriter()
            writer.addPage(reader.pages[page_no])

            with open(output_pdf_path, 'wb') as dst:
                writer.write(dst)

        print(f"成功提取第 {page_no + 1} 页并保存为 {output_pdf_path}")
    except Exception as e:
        print(f"提取页面失败:{e}")
+
+
def get_gpu_memory_usage():
    """Print memory usage and running compute processes for every visible GPU.

    Best-effort monitoring helper: every failure is logged and swallowed so
    callers are never interrupted. Fix: NVML is now always shut down in a
    ``finally`` — previously ``nvmlShutdown()`` was only reached on the success
    path, so any exception mid-loop leaked the NVML session.
    """
    initialized = False
    try:
        # 初始化 NVML
        pynvml.nvmlInit()
        initialized = True
        device_count = pynvml.nvmlDeviceGetCount()
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            gpu_name = pynvml.nvmlDeviceGetName(handle)

            # 显存信息, 转换为 MiB
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            total_memory = mem_info.total / (1024 * 1024)
            used_memory = mem_info.used / (1024 * 1024)
            free_memory = mem_info.free / (1024 * 1024)

            info = f'  时间:{now}\n'
            info += f"  GPU信息 {i}: {gpu_name.decode('utf-8')}\n"
            info += f"    总显存: {total_memory:.2f} MiB\n"
            info += f"    已用显存: {used_memory:.2f} MiB\n"
            info += f"    剩余显存: {free_memory:.2f} MiB\n\n"

            # 进程信息
            processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            if processes:
                info += f"  GPU进程信息: {i}\n"
                for p in processes:
                    pid = p.pid
                    used_memory = p.usedGpuMemory / (1024 * 1024)
                    try:
                        proc = psutil.Process(pid)
                        cmdline = proc.cmdline()
                        # NOTE(review): the [-17:-14] slice appears to pull a short
                        # service tag out of the command line — confirm intent
                        info += f"    {' '.join(cmdline)[-17:-14]} {pid}: {used_memory:.2f} MiB\n"
                    except Exception:
                        traceback.print_exc()
            print(info)
    except Exception:
        traceback.print_exc()
    finally:
        # 关闭 NVML — always, even when an error aborted the loop above
        if initialized:
            try:
                pynvml.nvmlShutdown()
            except Exception:
                pass
+
+
def get_current_process_gpu_id():
    """Return the index of the GPU the current process is running on, or None.

    Scans every NVML device for a compute process whose PID matches
    ``os.getpid()``. Fix: the unconditional ``nvmlShutdown()`` in ``finally``
    raised "Uninitialized" (masking the real error) whenever ``nvmlInit()``
    itself failed — shutdown is now guarded and its own errors suppressed.

    :return: GPU index (int) or None when not found / NVML unavailable
    """
    initialized = False
    try:
        # 初始化 NVML
        pynvml.nvmlInit()
        initialized = True

        current_pid = os.getpid()
        device_count = pynvml.nvmlDeviceGetCount()

        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)

            # 获取运行在该 GPU 上的进程
            try:
                processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            except pynvml.NVMLError:
                processes = []

            for p in processes:
                if p.pid == current_pid:
                    print(f"Process {current_pid} is running on GPU {i}")
                    return i

        print("Current process not found on any GPU")
        return None
    except Exception:
        traceback.print_exc()
        return None
    finally:
        # 关闭 NVML — only if init succeeded, and never let shutdown mask errors
        if initialized:
            try:
                pynvml.nvmlShutdown()
            except Exception:
                pass
+
+
def register_all_fonts(font_dir):
    """Recursively register every TrueType/OpenType font under font_dir
    with reportlab's pdfmetrics, using the file stem as the font name.

    :param font_dir: directory tree to scan for .ttf / .otf files
    """
    for dir_path, _sub_dirs, file_names in os.walk(font_dir):
        for file_name in file_names:
            # only TrueType / OpenType files are registrable
            if not file_name.endswith((".ttf", ".otf")):
                continue
            font_path = os.path.join(dir_path, file_name)
            font_name = os.path.splitext(file_name)[0]
            try:
                pdfmetrics.registerFont(TTFont(font_name, font_path))
                print(f"Font registered: {font_name}")
            except Exception as e:
                print(f"Failed to register font {font_name}: {e}")
+
+
def ascii85_decode(data):
    """Decode ASCII85-encoded bytes (Adobe variant, no <~ ~> framing).

    Fixes the original implementation, which sliced the input 5 characters at
    a time and therefore decoded any trailing partial group (input not a
    multiple of 4 plaintext bytes) incorrectly. Per the ASCII85 spec, a final
    group of n < 5 characters is padded with 'u' (digit 84) and yields n-1
    output bytes. The 'z' shorthand (four zero bytes) is honored only between
    groups; characters outside '!'..'u' (e.g. whitespace) are skipped.

    :param data: encoded payload as bytes
    :return: decoded bytes
    """
    decoded = bytearray()
    group = []  # digit values (0..84) accumulated for the current 5-char group
    for c in data:
        if c == ord('z') and not group:
            # 'z' abbreviates an all-zero 4-byte group
            decoded += b'\0\0\0\0'
            continue
        if ord('!') <= c <= ord('u'):
            group.append(c - ord('!'))
            if len(group) == 5:
                value = 0
                for digit in group:
                    value = value * 85 + digit
                decoded += value.to_bytes(4, byteorder='big')
                group = []
        # any other character (whitespace, framing) is ignored
    if group:
        # final partial group: pad with 'u' (84), keep only n-1 bytes
        n = len(group)
        group += [84] * (5 - n)
        value = 0
        for digit in group:
            value = value * 85 + digit
        decoded += value.to_bytes(4, byteorder='big')[:n - 1]
    return bytes(decoded)
+
+
def special_font_to_normal(text):
    """Map special Chinese glyphs in *text* to their basic CJK code points.

    Replaces characters found in the module-level ``extend_to_normal_dict``
    (extended forms) or, failing that, ``kangxi_to_normal_dict`` (Kangxi
    radicals) with their normal equivalents; other characters pass through.
    Fix: the original rebuilt two key-sets from the dicts on every call —
    dict membership is already O(1), so the sets were pure overhead.

    :param text: input string possibly containing special glyphs
    :return: string with special glyphs replaced
    """
    text_list = list(text)
    for i, c in enumerate(text_list):
        # extended-form mapping takes priority over the Kangxi mapping,
        # matching the original if/elif order
        if c in extend_to_normal_dict:
            text_list[i] = extend_to_normal_dict[c]
        elif c in kangxi_to_normal_dict:
            text_list[i] = kangxi_to_normal_dict[c]
    return ''.join(text_list)
+
+
def image_resize_by_ratio(img, max_width=1800, max_height=2600):
    """Shrink a PIL image to fit within (max_width, max_height), keeping its
    aspect ratio; images that already fit are returned unchanged.

    :param img: PIL Image-like object exposing .size and .resize
    :param max_width: maximum allowed width in pixels
    :param max_height: maximum allowed height in pixels
    :return: the original image, or a LANCZOS-resampled smaller copy
    """
    width, height = img.size
    aspect_ratio = width / height

    too_wide = width > max_width
    too_tall = height > max_height

    if too_wide and too_tall:
        # Scale along whichever axis overshoots its limit the most, so that
        # after scaling both dimensions fall within bounds.
        if width / max_width > height / max_height:
            target_w = max_width
            target_h = int(target_w / aspect_ratio)
        else:
            target_h = max_height
            target_w = int(target_h * aspect_ratio)
    elif too_wide:
        target_w = max_width
        target_h = int(target_w / aspect_ratio)
    elif too_tall:
        target_h = max_height
        target_w = int(target_h * aspect_ratio)
    else:
        target_w, target_h = width, height

    if (target_w, target_h) != (width, height):
        img = img.resize((target_w, target_h), Image.LANCZOS)
    return img
+
+
 if __name__ == "__main__":
 if __name__ == "__main__":
     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
     # print(slash_replace(strs))
     # print(slash_replace(strs))
@@ -2572,14 +2867,27 @@ if __name__ == "__main__":
 
 
     # print(parse_yaml())
     # print(parse_yaml())
 
 
-    print(get_ip_port())
+    # print(get_ip_port())
     # set_flask_global()
     # set_flask_global()
-    print(get_all_ip())
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "ocr"))
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], 'convert', 'MASTER'))
+    # print(get_all_ip())
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "ocr"))
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], 'convert', 'MASTER'))
     # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
     # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
     # print(get_intranet_ip())
     # print(get_intranet_ip())
-    # _path = "C:/Users/Administrator/Downloads/3.png"
-    # remove_red_seal(cv2.imread(_path))
+
+    # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_pdf\*.pdf')
+    # save_dir = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf'
+    # index = 0
+    # for p in ps:
+    #     save_path = f'{save_dir}/e-{index}.pdf'
+    #     page_no = int(re.split('\.|-', p)[1])
+    #     extract_one_page_pdf(p, save_path, page_no)
+    #     index += 1
+
+    # _ss = 'otr_interface:app'
+    # print(_ss[-17:-14])
+
+    _ss = '仁和坪镇杨柳池村⼈居环境整治项⽬终⽌'
+    print(special_font_to_normal(_ss))

+ 3 - 0
monitor/watch_10_minutes_process.sh

@@ -1,3 +1,6 @@
 #!/bin/bash
 #!/bin/bash
 
 
 sed -n '/2024-05-29 17:30:00/,/2024-05-29 17:40:00/p' /convert.out | grep 'is_success' | wc -l
 sed -n '/2024-05-29 17:30:00/,/2024-05-29 17:40:00/p' /convert.out | grep 'is_success' | wc -l
+
+
+sed -n '/2025-06-11 12:50:00/,/2025-06-11 13:00:00/p' /convert.out | grep 'is_success: ' | awk -F '[\\[\\] ]+' '{file_type=$(NF-2); time=$NF; map[file_type] += time; count[file_type]++} END {for (key in map) print key, "-", map[key], "-", count[key], "-", map[key]/count[key]}'

+ 8 - 26
ocr/ocr_interface.py

@@ -5,7 +5,7 @@ import multiprocessing as mp
 import socket
 import socket
 import sys
 import sys
 import os
 import os
-
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
 from PIL import Image
 from PIL import Image
 
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
@@ -91,7 +91,10 @@ def picture2text(img_data, ocr_model, only_rec=0):
         text_list = []
         text_list = []
         bbox_list = []
         bbox_list = []
         if only_rec:
         if only_rec:
-            text_list = [results[0][0]]
+            if results:
+                text_list = [results[0][0]]
+            else:
+                text_list = []
             bbox_list = []
             bbox_list = []
         else:
         else:
             for line in results:
             for line in results:
@@ -176,27 +179,6 @@ def test_ocr_model(from_remote=True):
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    test_ocr_model()
-
-#     src = """
-# data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASwAAAAeCAYAAACWuCNnAAAE3ElEQVR42u2dQWjUQBSGi4gnEURERKQgIiIiggdP4sWDiPTgvSAK4sFDEe8iIohHEW8iUsSLFBERQaSI9CBIERGRgpQiHtpuMkl2Pa/vH2fW2ZhssttkO8v+AyHTzCT53nT23zdvZpOJIAh2TEhqNpu7VaS+THiWWq3WHuFaQV7279bW1rarWP2y5eRnYv8ZI36l1PF2u71N52P11V7Ap+QaKbwPGo3GPudv8jOx/4wLv5xw2+ajKDrhY4cT5Z23xkKJhfOG5KeSJDlEfib2H/L7Y2ysHotRp62xsp8J4/Bc2ljyM7H/kN+LFEbhGxipjU3C86LYiRy7R34m9h/ye2s0DJZtEsaK0aujFO8ZdX72H/Kz/zup3W5vsfmgGRyJ4/hkug7cSBnn3nHrkZ+J/Yf8/8agkZoXRfyOPKYoaxnnRuqHbMti5EH9N1Q4DE91KXQYHnWNJf+IxJDY/uQfFj8qQzn1tCSglLpfm8v4d2y7omJ1E4bLvWa7WKQRBjGW/JsrVmx/8lfOf/HSlbatZPO4gVTeKhUjZxx6FdOR6XPSKa8s77hc8wIM1flYfcD4VrYW6kOpJf9etre97plxzZ78lX4wc/jlG+WZy9/nNYfGX4tYsf3JXwP/f2KCfd4GY+H6bUSwsrb13+t7jaLOWaMxBkZZEAT7scfisrKChelSaaxvmE6F2ouhd8M4vObyV5ny+PHNAH6U28VxPvLX0IHZ/uSvhb8jIlZQ5GbTUvmyzkdqCSeZ/CrGoQBBsAzHAFRWsNz7ZNWT63+Se10P4uCsLbOBO5ev0FjDb66p+c0Yu4sf5Xn8Aw5/uvjtP63v62wSf2XtwPYnf438mQJjhOsJ9nLya5xkXb8sDynPe8oSpqxyd02G68FJfjHPKytovDnwgzvNL2WfcQ/TGAuVjN8z+E1+MUmSXQP884fKX0Psiu1P/lr482NYcpJUPuMMB/VJ5keKjwYZEmYNPdPlcAVdYXLXavQVwzL8thHT/HV9WMHf+W3UBtaabBZ/laLF9id/1fyd9RO9YlcG5LVZYj/Vy9MZ9LgrZJi+xB7rNoqE0F3/Iar8Aq5pl1IbxXf5G0njcI3xmwU7/Zrmz0q+8feb2P7kHyY/Mk/hHloxgAvnBshc7wqPh8BUY1pgsrynokB7nujBjRSmnWUmAYwhmh8zEln82ugUf82ehebv4xvJK/4BvlHZ/uQfGn/noIlbvbRDMkwpOrGkn0Y9Z/MC4Hkxq6LydF3X4ys71HRmJzQ/HlNh+U15h1/KDwzL4+jH/fWFf1D3ne1P/mHw60pZImIj9WYt1LH0MvpeolM09CsTQC8rWJZfjProuKVTll8HBTP4vfmwk5/85C/PLwUPpcItu/zdDAFnigLcvQRlIx5WmetjOhZKnMWvV9wKv88BafKTn/wb4O9aHBqraayhQMCsKLheVmjq8LDMbMcS8piJ0NO1WBjn8Lvupm+J/OQn/wD8GC+aRz9Muk/8w0mIzPuq0pia1UaTn/zkHx9+85D45bxAmGdGzpOf/OQfY34peI7f+kDpfDQWv0XSY1tRYROQe0V+8pN/TPn5miPyk5/8I8PP1xyRn/zkHxl+viaI/OQnP/mrMpavOSI/+ck/SomvOSI/+ck/cqLF1xyRn/zk9yLxNUfkJz/5i9If9M5atZCy5xcAAAAASUVORK5CYII=
-# """
-#
-#     image_data = src.split('data:image/png;base64,')[1]
-#
-#     # 解码 base64 字符串
-#     image_bytes = base64.b64decode(image_data)
-#
-#     # 将字节转换为图像
-#     # image = Image.open(io.BytesIO(image_bytes))
-#
-#     # image.show('img')
-#
-#     # with open(r'C:\Users\Administrator\Desktop\test_image\error16.jpg', 'rb') as f:
-#     #     image_bytes = f.read()
-#
-#     image = bytes2np(image_bytes)
-#
-#     cv2.imshow('img', image)
-#     cv2.imwrite('./1.png', image)
-#     cv2.waitKey(0)
+    # test_ocr_model()
+
+    app.run(host='127.0.0.1', port=17000, debug=False)

+ 6 - 2
ocr/ppocr/data/__init__.py

@@ -25,6 +25,9 @@ import signal
 import random
 import random
 
 
 __dir__ = os.path.dirname(os.path.abspath(__file__))
 __dir__ = os.path.dirname(os.path.abspath(__file__))
+
+from format_convert.utils import get_platform
+
 sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
 sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
 
 
 import copy
 import copy
@@ -49,8 +52,9 @@ def term_mp(sig_num, frame):
     os.killpg(pgid, signal.SIGKILL)
     os.killpg(pgid, signal.SIGKILL)
 
 
 
 
-signal.signal(signal.SIGINT, term_mp)
-signal.signal(signal.SIGTERM, term_mp)
+if get_platform() != 'Windows':
+    signal.signal(signal.SIGINT, term_mp)
+    signal.signal(signal.SIGTERM, term_mp)
 
 
 
 
 def build_dataloader(config, mode, device, logger, seed=None):
 def build_dataloader(config, mode, device, logger, seed=None):

+ 39 - 0
ocr/test_lock.py

@@ -0,0 +1,39 @@
+import multiprocessing
+import os
+import sys
+import time
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + '/../')
+from format_convert.utils import file_lock
+
+
def run(a):
    """Stress-test file_lock: loop forever acquiring the shared 'ocr' lock,
    hold it briefly, then raise deliberately so the finally/close path
    (i.e. lock release on failure) is exercised.

    Fixes: ``f`` was unbound in ``finally`` when ``file_lock`` itself raised
    (NameError masked the real error), and the original bare ``raise`` had no
    active exception plus unreachable code after it.

    :param a: unused; present so the function is compatible with Pool.map
    """
    while True:
        f = None
        try:
            time2 = time.time()
            lock_file_sub = 'ocr'
            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
            f = file_lock(lock_file)
            print(os.getpid(), "get file_lock " + lock_file + " time ", time.time() - time2)
            time.sleep(2)
            # fail on purpose while holding the lock to verify release
            raise RuntimeError
        except Exception:
            print('RuntimeError')
        finally:
            # only close if the lock was actually acquired
            if f is not None:
                f.close()
+
+
+if __name__ == '__main__':
+    # 要处理的数据
+    data = [1, 2, 3]
+
+    # 创建进程池,指定进程数为 CPU 核心数
+    with multiprocessing.Pool(processes=3) as pool:
+        # 使用 map 方法分配任务并获取结果
+        results = pool.map(run, data)
+
+    # 输出结果
+    # print(results)

+ 36 - 52
ocr/tools/infer/predict_det_pytorch.py

@@ -19,7 +19,8 @@ import sys
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../")
 import requests
 import requests
 from format_convert import _global
 from format_convert import _global
-from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock
+from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock, \
+    get_gpu_memory_usage, get_current_process_gpu_id
 
 
 os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
 os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
 import cv2
 import cv2
@@ -120,6 +121,11 @@ class TextDetector(object):
         self.predictor.to(self.device)
         self.predictor.to(self.device)
         self.predictor.eval()
         self.predictor.eval()
 
 
+        if str(self.device) != 'cpu':
+            self.gpu_id = get_current_process_gpu_id()
+        else:
+            self.gpu_id = None
+
         # self.predictor, self.input_tensor, self.output_tensors = utility.create_predictor(
         # self.predictor, self.input_tensor, self.output_tensors = utility.create_predictor(
         #     args, 'det', logger)  # paddle.jit.load(args.det_model_dir)
         #     args, 'det', logger)  # paddle.jit.load(args.det_model_dir)
         # self.predictor.eval()
         # self.predictor.eval()
@@ -189,55 +195,44 @@ class TextDetector(object):
         shape_list = np.expand_dims(shape_list, axis=0)
         shape_list = np.expand_dims(shape_list, axis=0)
         img = img.copy()
         img = img.copy()
         starttime = time.time()
         starttime = time.time()
-
+        tensor = torch.from_numpy(img).float()
         # self.input_tensor.copy_from_cpu(img)
         # self.input_tensor.copy_from_cpu(img)
-        img = torch.from_numpy(img).float()
-        img = img.to(self.device)
-        try:
+        # if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not MAX_COMPUTE:
+        if get_platform() != "Windows" and not MAX_COMPUTE and self.gpu_id is not None:
             # 加锁,防止太多大图片同时预测,爆显存
             # 加锁,防止太多大图片同时预测,爆显存
-            if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not MAX_COMPUTE:
+            time2 = time.time()
+            lock_file_sub = f'ocr_{self.gpu_id}'
+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
+            f = file_lock(lock_file)
+            log("det get file_lock " + lock_file + " time " + str(time.time()-time2))
+
+            try:
                 time2 = time.time()
                 time2 = time.time()
-                lock_file_sub = 'ocr'
-                lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
-                f = file_lock(lock_file)
-                log("get file_lock " + lock_file_sub + " time " + str(time.time()-time2))
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                tensor = tensor.to(self.device)
                 with torch.no_grad():
                 with torch.no_grad():
-                    out = self.predictor(img)
+                    out = self.predictor(tensor)
+                log("get file_lock run det" + " time " + str(time.time()-time2))
+            except RuntimeError:
+                log("ocr/tools/infer/predict_det.py predict.run error! maybe no gpu memory!")
+                log("det predictor shrink memory! ori_im.shape " + str(ori_im.shape))
+                get_gpu_memory_usage()
+                raise RuntimeError
+            finally:
                 f.close()
                 f.close()
-            else:
-                with torch.no_grad():
-                    out = self.predictor(img)
-        except RuntimeError:
-            log("ocr/tools/infer/predict_det.py predict.run error! maybe no gpu memory!")
-            log("predictor shrink memory!")
-            # self.predictor.clear_intermediate_tensor()
-            # self.predictor.try_shrink_memory()
-            if str(self.device)!='cpu':
-                torch.cuda.empty_cache()
-                gc.collect()
-            raise RuntimeError
-
-        # outputs = []
-        # for output_tensor in self.output_tensors:
-        #     output = output_tensor.copy_to_cpu()
-        #     outputs.append(output)
-        out = out.cpu().numpy()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                # gc.collect()
+        else:
+            tensor = tensor.to(self.device)
+            with torch.no_grad():
+                out = self.predictor(tensor)
 
 
+        out = out.cpu().numpy()
         preds = {}
         preds = {}
         preds['maps'] = out
         preds['maps'] = out
 
 
-        # if self.det_algorithm == "EAST":
-        #     preds['f_geo'] = outputs[0]
-        #     preds['f_score'] = outputs[1]
-        # elif self.det_algorithm == 'SAST':
-        #     preds['f_border'] = outputs[0]
-        #     preds['f_score'] = outputs[1]
-        #     preds['f_tco'] = outputs[2]
-        #     preds['f_tvo'] = outputs[3]
-        # elif self.det_algorithm == 'DB':
-        #     preds['maps'] = outputs[0]
-        # else:
-        #     raise NotImplementedError
         post_result = self.postprocess_op(preds, shape_list)
         post_result = self.postprocess_op(preds, shape_list)
         dt_boxes = post_result[0]['points']
         dt_boxes = post_result[0]['points']
         if self.det_algorithm == "SAST" and self.det_sast_polygon:
         if self.det_algorithm == "SAST" and self.det_sast_polygon:
@@ -246,17 +241,6 @@ class TextDetector(object):
             dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
             dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
         elapse = time.time() - starttime
         elapse = time.time() - starttime
 
 
-        # 释放内存
-        # print("TextDetector", self.predictor)
-        # if TextDetector.shrink_memory_count % 100 == 0:
-            # print("TextDetector shrink memory")
-        # self.predictor.clear_intermediate_tensor()
-        # self.predictor.try_shrink_memory()
-        # TextDetector.shrink_memory_count += 1
-        if str(self.device) != 'cpu':
-            torch.cuda.empty_cache()
-            # gc.collect()
-
         return dt_boxes, elapse
         return dt_boxes, elapse
 
 
 
 

+ 188 - 173
ocr/tools/infer/predict_rec_pytorch.py

@@ -37,8 +37,9 @@ import ocr.tools.infer.utility as utility
 from ocr.ppocr.postprocess import build_post_process
 from ocr.ppocr.postprocess import build_post_process
 from ocr.ppocr.utils.logging import get_logger
 from ocr.ppocr.utils.logging import get_logger
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
-
-from format_convert.utils import judge_error_code, log, namespace_to_dict,get_platform
+from config.max_compute_config import MAX_COMPUTE
+from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock, \
+    get_gpu_memory_usage, get_current_process_gpu_id
 from format_convert import _global
 from format_convert import _global
 
 
 import torch
 import torch
@@ -56,6 +57,8 @@ class TextRecognizer(object):
         self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
         self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
         self.character_type = args.rec_char_type
         self.character_type = args.rec_char_type
         self.rec_batch_num = args.rec_batch_num
         self.rec_batch_num = args.rec_batch_num
+        self.rec_batch_num = 16
+        print('self.rec_batch_num', self.rec_batch_num)
         self.rec_algorithm = args.rec_algorithm
         self.rec_algorithm = args.rec_algorithm
         postprocess_params = {
         postprocess_params = {
             'name': 'CTCLabelDecode',
             'name': 'CTCLabelDecode',
@@ -64,23 +67,7 @@ class TextRecognizer(object):
             # "use_space_char": args.use_space_char
             # "use_space_char": args.use_space_char
             "use_space_char": False
             "use_space_char": False
         }
         }
-        # if self.rec_algorithm == "SRN":
-        #     postprocess_params = {
-        #         'name': 'SRNLabelDecode',
-        #         "character_type": args.rec_char_type,
-        #         "character_dict_path": args.rec_char_dict_path,
-        #         "use_space_char": args.use_space_char
-        #     }
-        # elif self.rec_algorithm == "RARE":
-        #     postprocess_params = {
-        #         'name': 'AttnLabelDecode',
-        #         "character_type": args.rec_char_type,
-        #         "character_dict_path": args.rec_char_dict_path,
-        #         "use_space_char": args.use_space_char
-        #     }
         self.postprocess_op = build_post_process(postprocess_params)
         self.postprocess_op = build_post_process(postprocess_params)
-        # self.predictor, self.input_tensor, self.output_tensors = \
-        #     utility.create_predictor(args, 'rec', logger)
 
 
         rec_model_path = args.rec_model_dir
         rec_model_path = args.rec_model_dir
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -100,19 +87,22 @@ class TextRecognizer(object):
         self.predictor.to(self.device)
         self.predictor.to(self.device)
         self.predictor.eval()
         self.predictor.eval()
 
 
+        if str(self.device) != 'cpu':
+            self.gpu_id = get_current_process_gpu_id()
+        else:
+            self.gpu_id = None
+
     def resize_norm_img(self, img, max_wh_ratio):
     def resize_norm_img(self, img, max_wh_ratio):
         h, w = img.shape[:2]
         h, w = img.shape[:2]
         imgC, imgH, imgW = self.rec_image_shape
         imgC, imgH, imgW = self.rec_image_shape
         assert imgC == img.shape[2]
         assert imgC == img.shape[2]
         # print('max_wh_ratio', max_wh_ratio)
         # print('max_wh_ratio', max_wh_ratio)
+        # max_wh_ratio h是w的10倍,直接返回
         if max_wh_ratio < 0.1:
         if max_wh_ratio < 0.1:
-            # if h > imgW:
-            #     resized_image = cv2.resize(img, (w, imgW))
-            # else:
-            #     resized_image = img
-
-            # max_wh_ratio h是w的10倍,直接跳过
-            resized_w = None
+            # log('max_wh_ratio < 0.1', )
+            resized_image = img.astype('float32')
+            resized_image = resized_image.transpose((2, 0, 1)) / 255
+            return resized_image
         else:
         else:
             if self.character_type == "ch":
             if self.character_type == "ch":
                 imgW = int((32 * max_wh_ratio))
                 imgW = int((32 * max_wh_ratio))
@@ -138,186 +128,211 @@ class TextRecognizer(object):
             padding_im[:, :, 0:resized_w] = resized_image
             padding_im[:, :, 0:resized_w] = resized_image
         return padding_im
         return padding_im
 
 
-    def resize_norm_img_srn(self, img, image_shape):
-        imgC, imgH, imgW = image_shape
-
-        img_black = np.zeros((imgH, imgW))
-        im_hei = img.shape[0]
-        im_wid = img.shape[1]
-
-        if im_wid <= im_hei * 1:
-            img_new = cv2.resize(img, (imgH * 1, imgH))
-        elif im_wid <= im_hei * 2:
-            img_new = cv2.resize(img, (imgH * 2, imgH))
-        elif im_wid <= im_hei * 3:
-            img_new = cv2.resize(img, (imgH * 3, imgH))
+    def predict(self, norm_img_batch):
+        tensor = torch.from_numpy(norm_img_batch).float()
+        # if norm_img.shape[3] >= 100 and get_platform() != "Windows" and not MAX_COMPUTE:
+        if get_platform() != "Windows" and not MAX_COMPUTE:
+            # 加锁
+            time2 = time.time()
+            lock_file_sub = 'ocr'
+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
+            f = file_lock(lock_file)
+            log("rec get file_lock " + lock_file + " time " + str(time.time()-time2))
+            try:
+                time2 = time.time()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                tensor = tensor.to(self.device)
+                with torch.no_grad():
+                    out = self.predictor(tensor)
+                log("get file_lock run rec" + " time " + str(time.time()-time2))
+            except RuntimeError:
+                log("ocr/tools/infer/predict_rec.py predict.run error! maybe no gpu memory!")
+                log("rec predictor shrink memory! ori_im.shape " + str(norm_img_batch.shape))
+                get_gpu_memory_usage()
+                raise RuntimeError
+            finally:
+                f.close()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                gc.collect()
         else:
         else:
-            img_new = cv2.resize(img, (imgW, imgH))
-
-        img_np = np.asarray(img_new)
-        img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
-        img_black[:, 0:img_np.shape[1]] = img_np
-        img_black = img_black[:, :, np.newaxis]
-
-        row, col, c = img_black.shape
-        c = 1
-
-        return np.reshape(img_black, (c, row, col)).astype(np.float32)
-
-    def srn_other_inputs(self, image_shape, num_heads, max_text_length):
-
-        imgC, imgH, imgW = image_shape
-        feature_dim = int((imgH / 8) * (imgW / 8))
-
-        encoder_word_pos = np.array(range(0, feature_dim)).reshape(
-            (feature_dim, 1)).astype('int64')
-        gsrm_word_pos = np.array(range(0, max_text_length)).reshape(
-            (max_text_length, 1)).astype('int64')
-
-        gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
-        gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
-            [-1, 1, max_text_length, max_text_length])
-        gsrm_slf_attn_bias1 = np.tile(
-            gsrm_slf_attn_bias1,
-            [1, num_heads, 1, 1]).astype('float32') * [-1e9]
-
-        gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
-            [-1, 1, max_text_length, max_text_length])
-        gsrm_slf_attn_bias2 = np.tile(
-            gsrm_slf_attn_bias2,
-            [1, num_heads, 1, 1]).astype('float32') * [-1e9]
-
-        encoder_word_pos = encoder_word_pos[np.newaxis, :]
-        gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
-
-        return [
-            encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
-            gsrm_slf_attn_bias2
-        ]
-
-    def process_image_srn(self, img, image_shape, num_heads, max_text_length):
-        norm_img = self.resize_norm_img_srn(img, image_shape)
-        norm_img = norm_img[np.newaxis, :]
-
-        [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
-            self.srn_other_inputs(image_shape, num_heads, max_text_length)
-
-        gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
-        gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
-        encoder_word_pos = encoder_word_pos.astype(np.int64)
-        gsrm_word_pos = gsrm_word_pos.astype(np.int64)
-
-        return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
-                gsrm_slf_attn_bias2)
+            tensor = tensor.to(self.device)
+            with torch.no_grad():
+                out = self.predictor(tensor)
+        # logging.info("ocr model predict time - rec" + str(time.time()-start_time))
+        out = out.cpu().numpy()
+        preds = out
+        return preds
+
+    def predict_batch(self, batch_list):
+        batch_out_list = []
+        if get_platform() != "Windows" and not MAX_COMPUTE and self.gpu_id is not None:
+            # 加锁
+            time2 = time.time()
+            lock_file_sub = f'ocr_{self.gpu_id}'
+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
+            f = file_lock(lock_file)
+            log("rec get file_lock " + lock_file + " time " + str(time.time()-time2))
+            try:
+                time2 = time.time()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                for sub_batch_list in batch_list:
+                    sub_batch_out = []
+                    for tensor in sub_batch_list:
+                        with torch.no_grad():
+                            out = self.predictor(tensor)
+                            out = out.cpu().numpy()
+                        sub_batch_out.append(out)
+                    # sub_batch_out = np.concatenate(sub_batch_out, axis=0)
+                    batch_out_list.append(sub_batch_out)
+                log("get file_lock run rec" + " time " + str(time.time()-time2))
+
+            except RuntimeError:
+                log("ocr/tools/infer/predict_rec.py predict.run error! maybe no gpu memory!")
+                log("rec predictor shrink memory! ori_im.shape " + str(tensor.shape))
+                get_gpu_memory_usage()
+                raise RuntimeError
+            finally:
+                f.close()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+        else:
+            for sub_batch_list in batch_list:
+                sub_batch_out = []
+                for tensor in sub_batch_list:
+                    # print('tensor.shape', tensor.shape)
+                    with torch.no_grad():
+                        out = self.predictor(tensor)
+                        out = out.cpu().numpy()
+                    # print('out.shape', out.shape)
+                    sub_batch_out.append(out)
+                # sub_batch_out = np.concatenate(sub_batch_out, axis=0)
+                batch_out_list.append(sub_batch_out)
+
+        # 转为numpy
+        for bi, sub_batch_out in enumerate(batch_out_list):
+            batch_out_list[bi] = np.concatenate(sub_batch_out, axis=0)
+        return batch_out_list
 
 
     def __call__(self, img_list):
     def __call__(self, img_list):
+        start_time = time.time()
+        # print('into TextRecognizer __call__')
         img_num = len(img_list)
         img_num = len(img_list)
-        # Calculate the aspect ratio of all text bars
+
+        # 过滤图片比例异常的
+        # print('rec len(img_list)', len(img_list))
+        temp_list = []
+        for img in img_list:
+            if img.shape[0] == 0 or img.shape[1] == 0 \
+                    or img.shape[0] >= 10000 or img.shape[1] >= 10000 \
+                    or img.shape[1] / img.shape[0] <= 0.5 \
+                    or img.shape[1] / img.shape[0] >= 100:
+                # print('rec img.shape[1] / img.shape[0] <= 0.5', img.shape)
+                continue
+            temp_list.append(img)
+        if not temp_list:
+            return None, 0
+        img_list = temp_list
+
+        # 按比例排序
         width_list = []
         width_list = []
         i = 0
         i = 0
         for img in img_list:
         for img in img_list:
-            # cv2.imwrite('D:/myProject/format_conversion_maxcompute/ocr/test/'+str(i)+'.jpg',img)
-            # i+=1
-            # cv2.imshow('img', img)
-            # cv2.waitKey(1000)
             width_list.append(img.shape[1] / float(img.shape[0]))
             width_list.append(img.shape[1] / float(img.shape[0]))
         # Sorting can speed up the recognition process
         # Sorting can speed up the recognition process
         indices = np.argsort(np.array(width_list))
         indices = np.argsort(np.array(width_list))
 
 
+        # 分批预测
         # rec_res = []
         # rec_res = []
         rec_res = [['', 0.0]] * img_num
         rec_res = [['', 0.0]] * img_num
         batch_num = self.rec_batch_num
         batch_num = self.rec_batch_num
         elapse = 0
         elapse = 0
+        batch_list = []
         for beg_img_no in range(0, img_num, batch_num):
         for beg_img_no in range(0, img_num, batch_num):
             end_img_no = min(img_num, beg_img_no + batch_num)
             end_img_no = min(img_num, beg_img_no + batch_num)
             norm_img_batch = []
             norm_img_batch = []
             max_wh_ratio = 0
             max_wh_ratio = 0
+            # 取这个batch中比例最大的
             for ino in range(beg_img_no, end_img_no):
             for ino in range(beg_img_no, end_img_no):
                 # h, w = img_list[ino].shape[0:2]
                 # h, w = img_list[ino].shape[0:2]
                 h, w = img_list[indices[ino]].shape[0:2]
                 h, w = img_list[indices[ino]].shape[0:2]
                 wh_ratio = w * 1.0 / h
                 wh_ratio = w * 1.0 / h
                 max_wh_ratio = max(max_wh_ratio, wh_ratio)
                 max_wh_ratio = max(max_wh_ratio, wh_ratio)
-            # print('max_wh_ratio',max_wh_ratio)
+            # print('max_wh_ratio', max_wh_ratio)
+
+            # resize image
             for ino in range(beg_img_no, end_img_no):
             for ino in range(beg_img_no, end_img_no):
-                if self.rec_algorithm != "SRN":
-                    # print('max_wh_ratio', max_wh_ratio)
-                    norm_img = self.resize_norm_img(img_list[indices[ino]],
-                                                    max_wh_ratio)
-                    # cv2.imshow('img', norm_img.transpose(1,2,0))
-                    # cv2.waitKey(1000)
-                    norm_img = norm_img[np.newaxis, :]
-                    norm_img_batch.append(norm_img)
-                else:
-                    # norm_img = self.process_image_srn(
-                    #     img_list[indices[ino]], self.rec_image_shape, 8, 25)
-                    # encoder_word_pos_list = []
-                    # gsrm_word_pos_list = []
-                    # gsrm_slf_attn_bias1_list = []
-                    # gsrm_slf_attn_bias2_list = []
-                    # encoder_word_pos_list.append(norm_img[1])
-                    # gsrm_word_pos_list.append(norm_img[2])
-                    # gsrm_slf_attn_bias1_list.append(norm_img[3])
-                    # gsrm_slf_attn_bias2_list.append(norm_img[4])
-                    # norm_img_batch.append(norm_img[0])
-                    pass
+                # print('img_list[indices[ino]].shape', img_list[indices[ino]].shape)
+                norm_img = self.resize_norm_img(img_list[indices[ino]],
+                                                max_wh_ratio)
+                # print('norm_img.shape', norm_img.shape)
+                norm_img = norm_img[np.newaxis, :]
+                norm_img_batch.append(norm_img)
+
             norm_img_batch = np.concatenate(norm_img_batch)
             norm_img_batch = np.concatenate(norm_img_batch)
             norm_img_batch = norm_img_batch.copy()
             norm_img_batch = norm_img_batch.copy()
 
 
-            if self.rec_algorithm == "SRN":
-                # starttime = time.time()
-                # encoder_word_pos_list = np.concatenate(encoder_word_pos_list)
-                # gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list)
-                # gsrm_slf_attn_bias1_list = np.concatenate(
-                #     gsrm_slf_attn_bias1_list)
-                # gsrm_slf_attn_bias2_list = np.concatenate(
-                #     gsrm_slf_attn_bias2_list)
-                #
-                # inputs = [
-                #     norm_img_batch,
-                #     encoder_word_pos_list,
-                #     gsrm_word_pos_list,
-                #     gsrm_slf_attn_bias1_list,
-                #     gsrm_slf_attn_bias2_list,
-                # ]
-                # input_names = self.predictor.get_input_names()
-                # for i in range(len(input_names)):
-                #     input_tensor = self.predictor.get_input_handle(input_names[
-                #         i])
-                #     input_tensor.copy_from_cpu(inputs[i])
-                # self.predictor.run()
-                # outputs = []
-                # for output_tensor in self.output_tensors:
-                #     output = output_tensor.copy_to_cpu()
-                #     outputs.append(output)
-                # preds = {"predict": outputs[2]}
-                pass
+            # 预测
+            # starttime = time.time()
+            # # 当图片很长时,降低batch,防止爆内存
+            # # print('norm_img_batch.shape', norm_img_batch.shape)
+            # preds = []
+            # if norm_img_batch.shape[-1] >= 400:
+            #     if norm_img_batch.shape[-1] <= 1000:
+            #         mini_batch_size = 4
+            #     elif norm_img_batch.shape[-1] <= 3000:
+            #         mini_batch_size = 2
+            #     else:
+            #         mini_batch_size = 1
+            #     for bi in range(0, norm_img_batch.shape[0], mini_batch_size):
+            #         sub_batch = norm_img_batch[bi:bi+mini_batch_size]
+            #         sub_preds = self.predict(sub_batch)
+            #         preds.append(sub_preds)
+            #         # print('type(sub_preds), sub_preds.shape', type(sub_preds), sub_preds.shape)
+            #     preds = np.concatenate(preds, axis=0)
+            # else:
+            #     preds = self.predict(norm_img_batch)
+            # # print('type(preds), preds.shape', type(preds), preds.shape)
+            #
+            # # 后处理
+            # rec_result = self.postprocess_op(preds)
+            # for rno in range(len(rec_result)):
+            #     rec_res[indices[beg_img_no + rno]] = rec_result[rno]
+            # elapse += time.time() - starttime
+
+            # 根据长度,动态batch
+            if norm_img_batch.shape[-1] >= 400:
+                if norm_img_batch.shape[-1] <= 1000:
+                    mini_batch_size = 4
+                elif norm_img_batch.shape[-1] <= 3000:
+                    mini_batch_size = 2
+                else:
+                    mini_batch_size = 1
+                sub_batch_list = []
+                for bi in range(0, norm_img_batch.shape[0], mini_batch_size):
+                    sub_batch = norm_img_batch[bi:bi+mini_batch_size]
+                    tensor = torch.from_numpy(sub_batch).float()
+                    tensor = tensor.to(self.device)
+                    sub_batch_list.append(tensor)
             else:
             else:
-                starttime = time.time()
-
                 tensor = torch.from_numpy(norm_img_batch).float()
                 tensor = torch.from_numpy(norm_img_batch).float()
-                start_time = time.time()
                 tensor = tensor.to(self.device)
                 tensor = tensor.to(self.device)
-                with torch.no_grad():
-                    out = self.predictor(tensor)
-                logging.info("ocr model predict time - rec" + str(time.time()-start_time))
-                out = out.cpu().numpy()
-                preds = out
+                sub_batch_list = [tensor]
 
 
-            # print("tools/infer/predict_rec preds", preds)
-            rec_result = self.postprocess_op(preds)
-            for rno in range(len(rec_result)):
-                # print("predict_rec", img_num, batch_num, beg_img_no,
-                #       indices[beg_img_no + rno], len(rec_res))
-                rec_res[indices[beg_img_no + rno]] = rec_result[rno]
-            elapse += time.time() - starttime
-            # 释放内存
-            # self.predictor.clear_intermediate_tensor()
-            # self.predictor.try_shrink_memory()
-
-            # gc.collect()
-            if str(self.device)!='cpu':
-                torch.cuda.empty_cache()
-            #     gc.collect()
+            batch_list.append(sub_batch_list)
+
+        # 预测
+        batch_out_list = self.predict_batch(batch_list)
+
+        # 后处理
+        for bi, out in enumerate(batch_out_list):
+            begin_img_no = bi * batch_num
+            rec_result = self.postprocess_op(out)
+            for ri in range(len(rec_result)):
+                rec_res[indices[begin_img_no + ri]] = rec_result[ri]
+        elapse += time.time() - start_time
         return rec_res, elapse
         return rec_res, elapse
 
 
 
 

+ 106 - 45
ocr/tools/infer/predict_system.py

@@ -26,17 +26,19 @@ import copy
 import numpy as np
 import numpy as np
 import time
 import time
 from PIL import Image
 from PIL import Image
+
 os.environ['FLAGS_eager_delete_tensor_gb'] = '0'
 os.environ['FLAGS_eager_delete_tensor_gb'] = '0'
 import utility as utility
 import utility as utility
 # import ocr.tools.infer.predict_rec as predict_rec
 # import ocr.tools.infer.predict_rec as predict_rec
-import ocr.tools.infer.predict_rec_pytorch as predict_rec # pytorch rec model
+import ocr.tools.infer.predict_rec_pytorch as predict_rec  # pytorch rec model
 # import ocr.tools.infer.predict_det as predict_det
 # import ocr.tools.infer.predict_det as predict_det
-import ocr.tools.infer.predict_det_pytorch as predict_det # pytorch det model
+import ocr.tools.infer.predict_det_pytorch as predict_det  # pytorch det model
 import ocr.tools.infer.predict_cls as predict_cls
 import ocr.tools.infer.predict_cls as predict_cls
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
 from ocr.ppocr.utils.logging import get_logger
 from ocr.ppocr.utils.logging import get_logger
 from ocr.tools.infer.utility import draw_ocr_box_txt
 from ocr.tools.infer.utility import draw_ocr_box_txt
-from format_convert.utils import has_intersection
+from format_convert.utils import has_intersection, log
+from format_convert import _global
 
 
 logger = get_logger()
 logger = get_logger()
 
 
@@ -61,27 +63,36 @@ class TextSystem(object):
         points[:, 0] = points[:, 0] - left
         points[:, 0] = points[:, 0] - left
         points[:, 1] = points[:, 1] - top
         points[:, 1] = points[:, 1] - top
         '''
         '''
-        img_crop_width = int(
-            max(
-                np.linalg.norm(points[0] - points[1]),
-                np.linalg.norm(points[2] - points[3])))
-        img_crop_height = int(
-            max(
-                np.linalg.norm(points[0] - points[3]),
-                np.linalg.norm(points[1] - points[2])))
-        pts_std = np.float32([[0, 0], [img_crop_width, 0],
-                              [img_crop_width, img_crop_height],
-                              [0, img_crop_height]])
-        M = cv2.getPerspectiveTransform(points, pts_std)
-        dst_img = cv2.warpPerspective(
-            img,
-            M, (img_crop_width, img_crop_height),
-            borderMode=cv2.BORDER_REPLICATE,
-            flags=cv2.INTER_CUBIC)
-        dst_img_height, dst_img_width = dst_img.shape[0:2]
-        # if dst_img_height * 1.0 / dst_img_width >= 1.5:
-        if dst_img_height * 1.0 / dst_img_width >= 2.0:
-            dst_img = np.rot90(dst_img)
+        # img_crop_width = int(
+        #     max(
+        #         np.linalg.norm(points[0] - points[1]),
+        #         np.linalg.norm(points[2] - points[3])))
+        # img_crop_height = int(
+        #     max(
+        #         np.linalg.norm(points[0] - points[3]),
+        #         np.linalg.norm(points[1] - points[2])))
+        # pts_std = np.float32([[0, 0], [img_crop_width, 0],
+        #                       [img_crop_width, img_crop_height],
+        #                       [0, img_crop_height]])
+        # M = cv2.getPerspectiveTransform(points, pts_std)
+        # dst_img = cv2.warpPerspective(
+        #     img,
+        #     M, (img_crop_width, img_crop_height),
+        #     borderMode=cv2.BORDER_REPLICATE,
+        #     flags=cv2.INTER_CUBIC)
+        # print('dst_img.shape', dst_img.shape)
+        #
+        # print('points', points)
+        w = abs(points[2][0] - points[0][0])
+        h = abs(points[2][1] - points[0][1])
+        dst_img = img[int(points[0][1]):int(points[0][1] + h), int(points[0][0]):int(points[0][0] + w), :]
+        # print('dst_img.shape2', dst_img.shape)
+        # cv2.imshow('dst_img', dst_img)
+        # cv2.waitKey(0)
+        # dst_img_height, dst_img_width = dst_img.shape[0:2]
+        # # if dst_img_height * 1.0 / dst_img_width >= 1.5:
+        # if dst_img_height * 1.0 / dst_img_width >= 2.0:
+        #     dst_img = np.rot90(dst_img)
         return dst_img
         return dst_img
 
 
     def print_draw_crop_rec_res(self, img_crop_list, rec_res):
     def print_draw_crop_rec_res(self, img_crop_list, rec_res):
@@ -91,6 +102,7 @@ class TextSystem(object):
             logger.info(bno, rec_res[bno])
             logger.info(bno, rec_res[bno])
 
 
     def __call__(self, img):
     def __call__(self, img):
+        # print('into TextSystem __call__')
         # cv2.imshow('img',img)
         # cv2.imshow('img',img)
         # cv2.waitKey(0)
         # cv2.waitKey(0)
         ori_im = img.copy()
         ori_im = img.copy()
@@ -98,15 +110,65 @@ class TextSystem(object):
         logger.info("dt_boxes num : {}, elapse : {}".format(
         logger.info("dt_boxes num : {}, elapse : {}".format(
             len(dt_boxes), elapse))
             len(dt_boxes), elapse))
         if dt_boxes is None:
         if dt_boxes is None:
-            return None, None
-        img_crop_list = []
+            return [], []
 
 
-        dt_boxes = sorted_boxes(dt_boxes)
+        temp_list = []
+        # print('dt_boxes', type(dt_boxes))
+        # print('dt_boxes.shape', dt_boxes.shape)
+        # 过滤一些比例离谱的box
+        for b in dt_boxes:
+            w = b[2][0] - b[0][0]
+            h = b[2][1] - b[0][1]
+            if h == 0 or w == 0 \
+                    or h >= 10000 or w >= 10000 \
+                    or w / h <= 0.5 or w / h >= 100:
+                continue
+            temp_list.append(b)
+
+        if not temp_list:
+            return [], []
+        dt_boxes = np.array(temp_list)
+        # print('dt_boxes.shape2', dt_boxes.shape)
+
+        # show
+        # for b in dt_boxes:
+        #     p1 = [int(x) for x in b[0]]
+        #     p2 = [int(x) for x in b[2]]
+        #     cv2.rectangle(img, p1, p2, (0, 0, 255))
+        # cv2.namedWindow('img', cv2.WINDOW_NORMAL)
+        # cv2.imshow('img', img)
+        # cv2.waitKey(0)
+
+        # # 检测过多单字box,返回None
+        # if len(dt_boxes) >= 150:
+        #     short_box_cnt = 0
+        #     long_box_cnt = 0
+        #     for b in dt_boxes:
+        #         w = b[2][0] - b[0][0]
+        #         h = b[2][1] - b[0][1]
+        #         if w / h < 1.3:
+        #             short_box_cnt += 1
+        #         if w / h >= 3:
+        #             long_box_cnt += 1
+        #         print('dt_boxes', w, h, round(w/h, 3))
+        #     # print('short_box_cnt, len(dt_boxes)', short_box_cnt, len(dt_boxes))
+        #     log('short_box_cnt, long_box_cnt, len(dt_boxes) ' + str([short_box_cnt, long_box_cnt, len(dt_boxes)]))
+        #     if short_box_cnt >= 2/3 * len(dt_boxes) and long_box_cnt < 10:
+        #         # print('short_box_cnt >= 2/3 * len(dt_boxes), return None')
+        #         log('short_box_cnt >= 2/3 * len(dt_boxes), return None. ' + str([short_box_cnt, long_box_cnt, len(dt_boxes)]))
+        #         return [], []
 
 
+        img_crop_list = []
+        dt_boxes = sorted_boxes(dt_boxes)
         for bno in range(len(dt_boxes)):
         for bno in range(len(dt_boxes)):
             tmp_box = copy.deepcopy(dt_boxes[bno])
             tmp_box = copy.deepcopy(dt_boxes[bno])
             img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
             img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
             img_crop_list.append(img_crop)
             img_crop_list.append(img_crop)
+        # print('system len(img_crop_list)', len(img_crop_list))
+        # for img in img_crop_list:
+        #     if img.shape[1] / img.shape[0] <= 0.5:
+        # print('system img.shape[1] / img.shape[0] <= 0.5', img.shape)
+
         if self.use_angle_cls:
         if self.use_angle_cls:
             img_crop_list, angle_list, elapse = self.text_classifier(
             img_crop_list, angle_list, elapse = self.text_classifier(
                 img_crop_list)
                 img_crop_list)
@@ -131,6 +193,7 @@ class TextSystem(object):
                 filter_rec_res.append(rec_reuslt)
                 filter_rec_res.append(rec_reuslt)
         return filter_boxes, filter_rec_res
         return filter_boxes, filter_rec_res
 
 
+
 def boxex_points_fixup(dt_boxes):
 def boxex_points_fixup(dt_boxes):
     # 检查框全部转换为矩形
     # 检查框全部转换为矩形
     # for i in range(len(dt_boxes)):
     # for i in range(len(dt_boxes)):
@@ -143,39 +206,37 @@ def boxex_points_fixup(dt_boxes):
     #     y_min = min(y_list)
     #     y_min = min(y_list)
     #     dt_boxes[i] = np.array([[x_min,y_min],[x_max,y_min],[x_max,y_max],[x_min,y_max]])
     #     dt_boxes[i] = np.array([[x_min,y_min],[x_max,y_min],[x_max,y_max],[x_min,y_max]])
 
 
-
     for i in range(len(dt_boxes)):
     for i in range(len(dt_boxes)):
         box1 = dt_boxes[i]
         box1 = dt_boxes[i]
         box1_point3 = box1[2]
         box1_point3 = box1[2]
-        box1_point4 = box1[3] # 四边形底边的两点坐标
-        bottom_line = (min(box1_point3[0],box1_point4[0]),max(box1_point3[0],box1_point4[0]))
-        bottom_line_len = abs(bottom_line[1]-bottom_line[0])
+        box1_point4 = box1[3]  # 四边形底边的两点坐标
+        bottom_line = (min(box1_point3[0], box1_point4[0]), max(box1_point3[0], box1_point4[0]))
+        bottom_line_len = abs(bottom_line[1] - bottom_line[0])
 
 
-        for j in range(i+1,len(dt_boxes)):
+        for j in range(i + 1, len(dt_boxes)):
             box2 = dt_boxes[j]
             box2 = dt_boxes[j]
             box2_point1 = box2[0]
             box2_point1 = box2[0]
-            box2_point2 = box2[1] # 四边形顶边的两点坐标
+            box2_point2 = box2[1]  # 四边形顶边的两点坐标
             top_line = (min(box2_point1[0], box2_point2[0]), max(box2_point1[0], box2_point2[0]))
             top_line = (min(box2_point1[0], box2_point2[0]), max(box2_point1[0], box2_point2[0]))
-            top_line_len = abs(top_line[1]-top_line[0])
+            top_line_len = abs(top_line[1] - top_line[0])
             if has_intersection(box1, box2):  # 四边形框是否有交集
             if has_intersection(box1, box2):  # 四边形框是否有交集
-                if not (min(top_line)>=max(bottom_line) or min(bottom_line)>=max(top_line)):  # x轴方向上有交集
+                if not (min(top_line) >= max(bottom_line) or min(bottom_line) >= max(top_line)):  # x轴方向上有交集
                     # 求重合部分y中间值
                     # 求重合部分y中间值
                     mid_y = ((box2_point1[1] + box2_point2[1]) / 2 + (box1_point3[1] + box1_point4[1]) / 2) // 2
                     mid_y = ((box2_point1[1] + box2_point2[1]) / 2 + (box1_point3[1] + box1_point4[1]) / 2) // 2
                     if not mid_y:
                     if not mid_y:
                         continue
                         continue
-                    max_line_len = max(bottom_line_len,top_line_len)
+                    max_line_len = max(bottom_line_len, top_line_len)
                     cross_line_len = bottom_line_len + top_line_len - \
                     cross_line_len = bottom_line_len + top_line_len - \
-                                     (max(bottom_line[1],bottom_line[0],top_line[1],top_line[0]) - min(bottom_line[1],bottom_line[0],top_line[1],top_line[0]))
+                                     (max(bottom_line[1], bottom_line[0], top_line[1], top_line[0]) - min(
+                                         bottom_line[1], bottom_line[0], top_line[1], top_line[0]))
                     # print(cross_line_len,max_line_len,cross_line_len/max_line_len)
                     # print(cross_line_len,max_line_len,cross_line_len/max_line_len)
-                    if cross_line_len/max_line_len>=0.55: # 重合比例
-                        box1[2] = [box1_point3[0],mid_y]
-                        box1[3] = [box1_point4[0],mid_y]
-                        box2[0] = [box2_point1[0],mid_y]
-                        box2[1] = [box2_point2[0],mid_y]
+                    if cross_line_len / max_line_len >= 0.55:  # 重合比例
+                        box1[2] = [box1_point3[0], mid_y]
+                        box1[3] = [box1_point4[0], mid_y]
+                        box2[0] = [box2_point1[0], mid_y]
+                        box2[1] = [box2_point2[0], mid_y]
                         break
                         break
 
 
-
-
     return dt_boxes
     return dt_boxes
 
 
 
 
@@ -247,4 +308,4 @@ def main(args):
 if __name__ == "__main__":
 if __name__ == "__main__":
     main(utility.parse_args())
     main(utility.parse_args())
 
 
-    pass
+    pass

+ 1 - 0
start_and_stop/kill_convert.sh

@@ -0,0 +1 @@
+kill -9 $(lsof -i:15010|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')

+ 256 - 30
tika_/tika_interface.py

@@ -1,3 +1,5 @@
+import base64
+import io
 import json
 import json
 import os
 import os
 import re
 import re
@@ -7,8 +9,11 @@ import traceback
 from glob import glob
 from glob import glob
 
 
 import psutil
 import psutil
+from PIL import Image
+from bs4 import BeautifulSoup
 
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from config.max_compute_config import MAX_COMPUTE
 _dir = os.path.abspath(os.path.dirname(__file__))
 _dir = os.path.abspath(os.path.dirname(__file__))
 os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
 os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
 os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
 os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
@@ -16,12 +21,19 @@ os.environ["TIKA_PATH"] = _dir + "/files/"
 os.environ["TIKA_LOG_FILE"] = "tika.log"
 os.environ["TIKA_LOG_FILE"] = "tika.log"
 
 
 from format_convert import _global
 from format_convert import _global
-from format_convert.utils import log, request_post, dynamic_get_port
+from format_convert.utils import log, request_post, dynamic_get_port, get_platform
 import tika
 import tika
 from tika import parser, config
 from tika import parser, config
 from tika.tika import runCommand
 from tika.tika import runCommand
 from flask import Flask, request
 from flask import Flask, request
 
 
+if get_platform() == "Windows":
+    FROM_REMOTE = False
+else:
+    FROM_REMOTE = True
+
+if MAX_COMPUTE:
+    FROM_REMOTE = False
 
 
 # 接口配置
 # 接口配置
 app = Flask(__name__)
 app = Flask(__name__)
@@ -46,18 +58,18 @@ def _tika():
         _md5 = request.form.get("md5")
         _md5 = request.form.get("md5")
         _global.update({"md5": _md5})
         _global.update({"md5": _md5})
 
 
-        html = tika_interface(data).get('html')
-        return json.dumps({"html": html})
+        html = tika_interface(data).get('data')
+        return json.dumps({"data": html})
     except TimeoutError:
     except TimeoutError:
-        return json.dumps({"html": [-5]})
+        return json.dumps({"data": [-5]})
     except:
     except:
         traceback.print_exc()
         traceback.print_exc()
-        return json.dumps({"html": [-1]})
+        return json.dumps({"data": [-1]})
     finally:
     finally:
         log("tika interface finish time " + str(time.time()-start_time))
         log("tika interface finish time " + str(time.time()-start_time))
 
 
 
 
-def tika_interface(_path, show=1):
+def tika_interface(_path, show=0):
     try:
     try:
         # apache tika服务器 提取
         # apache tika服务器 提取
         # text = runCommand('parse', 'all', _path, '9998', outDir='./files/')
         # text = runCommand('parse', 'all', _path, '9998', outDir='./files/')
@@ -67,7 +79,8 @@ def tika_interface(_path, show=1):
         if globals().get(key):
         if globals().get(key):
             port = globals().get(key)
             port = globals().get(key)
         else:
         else:
-            port = dynamic_get_port(port)
+            if FROM_REMOTE:
+                port = dynamic_get_port(port)
             if port is None:
             if port is None:
                 kill_tika_java_server()
                 kill_tika_java_server()
                 # return {"html": [-19]}
                 # return {"html": [-19]}
@@ -76,31 +89,104 @@ def tika_interface(_path, show=1):
         url = 'http://localhost:' + str(port)
         url = 'http://localhost:' + str(port)
         log('tika ' + key + ' port: ' + str(port))
         log('tika ' + key + ' port: ' + str(port))
         parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
         parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
-        html = parsed.get('content')
-
-        # 处理html
-        html = html.split('\n')
-        temp_list = []
-        for line in html:
-            if '<meta' in line:
-                continue
-            temp_list.append(line)
-        html = temp_list
-        if len(html) <= 4:
-            return {"html": ''}
-
-        html = html[:2] + ['<meta charset="UTF-8">'] + html[2:]
-        html = '\n'.join(html)
-        html = re.sub('<table>', '<table border="1">', html)
-        html = re.sub(' class="正文"', '', html)
+        # print('parsed', parsed)
+        html = parsed.get('content', '')
 
 
+        # 提取html各种元素,其中图片只是一个映射
+        soup = BeautifulSoup(html, 'lxml')
+        tag_list = collect_soup_elements(soup)
         if show:
         if show:
-            with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
-                f.write(html)
+            print('tag_list0', tag_list)
+
+        if not tag_list:
+            return {"data": tag_list}
+
+        # docx不是二进制,不能直接读二进制图片
+        if _path[-3:] == 'doc':
+            # 直接从二进制提取图片,保存在同一目录下
+            ss = re.split('[/\\\]', _path)
+            save_dir = os.sep.join(ss[:-1])
+            file_name = re.split('\.', ss[-1])[0]
+            if show:
+                print('save_dir', save_dir)
+                print('file_name', file_name)
+            image_path_dict = extract_images_from_doc(_path, save_dir)
+
+            if show:
+                print('image_path_dict', image_path_dict)
+
+            # embedded_images = re.findall(r'embedded:image[^"]+', html)
+            match_flag = 1
+            for tag in tag_list:
+                tag_name, value = tag
+                if tag_name != 'img':
+                    continue
+                # 提取图片文件名
+                image_name = file_name + '_' + re.sub('image', '', value)
+                if show:
+                    print('image_name', image_name)
+                # 保证所有image映射都对得上
+                real_image_path = image_path_dict.get(image_name)
+                if real_image_path is None:
+                    match_flag = 0
+                    break
+                else:
+                    tag[1] = real_image_path
+            if show:
+                print('match_flag', match_flag)
+
+            if match_flag:
+                # 图片数量能对上,则是正确的
+                pass
+            else:
+                # 图片对不上,则删除所有图片类型的tag
+                temp_list = []
+                for tag_name, value in tag_list:
+                    if tag_name == 'img':
+                        continue
+                    temp_list.append([tag_name, value])
+                tag_list = temp_list
+
+        elif _path[-4:] == 'docx':
+            temp_list = []
+            for tag_name, value in tag_list:
+                if tag_name == 'img':
+                    continue
+                temp_list.append([tag_name, value])
+            tag_list = temp_list
+
+
+        # # 处理html
+        # html = html.split('\n')
+        # temp_list = []
+        # for line in html:
+        #     if '<meta' in line:
+        #         continue
+        #     temp_list.append(line)
+        # html = temp_list
+        # if len(html) <= 4:
+        #     return {"html": ''}
+        #
+        # html = html[:2] + ['<meta charset="UTF-8">'] + html[2:]
+        # html = '\n'.join(html)
+        # html = re.sub('<table>', '<table border="1">', html)
+        # html = re.sub(' class="正文"', '', html)
+        #
+        # if show:
+        #     with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
+        #         f.write(html)
+    # except:
+    #     traceback.print_exc()
+    #     return {"html": [-17]}
+    # return {"html": html}
+
+        if show:
+            print('tag_list final', tag_list)
+
     except:
     except:
         traceback.print_exc()
         traceback.print_exc()
-        return {"html": [-17]}
-    return {"html": html}
+        return {"data": [-17]}
+    return {"data": tag_list}
 
 
 
 
 def kill_tika_java_server():
 def kill_tika_java_server():
@@ -122,6 +208,139 @@ def kill_tika_java_server():
             os.system(comm)
             os.system(comm)
 
 
 
 
def extract_images_from_doc(doc_file_path, output_folder):
    """Extract embedded JPG/PNG images from a binary .doc/.wps file.

    Scans the raw file bytes for JPEG and PNG start/end magic signatures
    and writes every matched span to *output_folder*.

    Args:
        doc_file_path: Path of the binary document to scan.
        output_folder: Directory where extracted images are written.

    Returns:
        dict: Mapping of generated image file name
        (``<doc_name>_<n>.<ext>``) to the saved image path.
    """
    # Magic-byte markers for the start and end of each supported format.
    image_signatures = {
        'jpg': (b'\xFF\xD8', b'\xFF\xD9'),
        'png': (b'\x89PNG', b'\x49\x45\x4E\x44\xAE\x42\x60\x82')
    }

    # Base name without extension. os.path handles names containing extra
    # dots correctly; the old regex split ('[/\\\.]' — also an invalid
    # escape in a character class) picked the wrong segment for "a.b.doc".
    file_name = os.path.splitext(os.path.basename(doc_file_path))[0]

    # Read the whole document as raw bytes.
    with open(doc_file_path, 'rb') as doc_file:
        doc_data = doc_file.read()

    output_file_path_dict = {}
    # Find and extract every image of every supported format.
    for img_format, (start_sig, end_sig) in image_signatures.items():
        start_index = 0
        image_count = 1
        while True:
            # Locate the next image start marker.
            start_index = doc_data.find(start_sig, start_index)
            if start_index == -1:
                break

            # Locate the matching end marker strictly after the start
            # signature (searching from start_index could in principle
            # match inside the start marker itself).
            end_index = doc_data.find(end_sig, start_index + len(start_sig))
            if end_index == -1:
                break

            # Extract the image bytes, including the end marker.
            end_index += len(end_sig)
            image_data = doc_data[start_index:end_index]

            # Save the image next to its siblings, numbered per format.
            image_name = f'{file_name}_{image_count}.{img_format}'
            image_path = os.path.join(output_folder, image_name)
            with open(image_path, 'wb') as img_file:
                img_file.write(image_data)
            print(f'Saved {img_format} image to {image_path}')
            output_file_path_dict[image_name] = image_path

            # Continue scanning after this image.
            start_index = end_index
            image_count += 1
    return output_file_path_dict
+
+
def is_image_valid(image_path):
    """Return True if the image file at *image_path* can be fully decoded.

    Opens the file with Pillow and forces a full pixel load, so truncated
    or corrupt image data is detected — not just a bad header.

    Args:
        image_path: Path of the image file to check.

    Returns:
        bool: True when the image decodes cleanly, False otherwise
        (missing file, unknown format, or corrupt data).
    """
    try:
        with Image.open(image_path) as img:
            # load() decodes all pixel data; open() alone only parses the header.
            img.load()
        return True
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
        return False
+
+
def is_image_data_valid(image_data):
    """Return True if *image_data* (raw bytes) decodes as a valid image.

    Wraps the bytes in an in-memory file and forces a full pixel load with
    Pillow, so truncated or corrupt streams are detected — not just bad
    headers.

    Args:
        image_data (bytes): Raw image byte stream to check.

    Returns:
        bool: True when the stream decodes cleanly, False otherwise.
    """
    try:
        # Present the byte stream as a file-like object for Pillow.
        image_file = io.BytesIO(image_data)
        with Image.open(image_file) as img:
            # load() decodes all pixel data; open() alone only parses the header.
            img.load()
        return True
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
        return False
+
+
def collect_soup_elements(soup):
    """Flatten a parsed tika XHTML document into an ordered tag list.

    Walks the descendants of <body> in document order and collects:
      * ['text', text]   for each <p> with non-empty stripped text
      * ['img', alt]     for each <img> (alt carries tika's embedded-image
                         name, e.g. "image1.jpg"; may be None — TODO confirm
                         tika always sets alt)
      * ['table', html]  for each <table>, serialized with border="1" and
                         with inner <p> wrappers removed so cells hold bare
                         text

    Args:
        soup: BeautifulSoup document parsed from tika's XHTML output.

    Returns:
        list: [tag_name, value] pairs in document order; empty when the
        document has no <body>.
    """
    # Force visible borders so downstream table handling keeps the grid
    # (replaces the old re.sub('<table>', '<table border="1">') approach).
    for table in soup.find_all('table'):
        table['border'] = "1"

    elements = []
    if soup.body is None:
        # tika produced no <body> (empty or unparseable document); the old
        # code raised AttributeError here and the caller reported [-17].
        return elements

    # Note: tables appear in .descendants before their children, so each
    # table is serialized (and its inner <p> tags unwrapped) before the
    # walk reaches the cells.
    for element in soup.body.descendants:
        if element.name == 'p':
            text = element.get_text(strip=True)
            if text:
                elements.append(['text', text])
        elif element.name == 'img':
            # alt holds the embedded-image mapping name used by the caller.
            elements.append(['img', element.get('alt')])
        elif element.name == 'table':
            # Unwrap <p> inside cells so the serialized table is plain text
            # per cell rather than nested paragraph markup.
            for p_tag in element.find_all('p'):
                p_tag.unwrap()
            elements.append(['table', str(element)])
    return elements
+
+
 def test_interface():
 def test_interface():
     # paths = glob("C:/Users/Administrator/Downloads/1716253106319.doc")
     # paths = glob("C:/Users/Administrator/Downloads/1716253106319.doc")
     paths = ["files/1716253106319.doc"]
     paths = ["files/1716253106319.doc"]
@@ -153,6 +372,13 @@ if __name__ == "__main__":
     #     # _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
     #     # _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
     #     tika_interface(_p)
     #     tika_interface(_p)
 
 
-    # app.run(host='0.0.0.0', port=5000)
+    # app.run(host='0.0.0.0', port=16050)
     # test_interface()
     # test_interface()
-    kill_tika_java_server()
+    # kill_tika_java_server()
+
+    # p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
+    # extract_images_from_doc(p, '.')
+
+    _p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
+    c = tika_interface(_p)