Quellcode durchsuchen

1. 新增wps类型
2. 新增ofd类型
3. 新增两列无边框表格识别
4. 修复ocr爆显存
5. pdf处理速度优化
6. 特殊康熙字体处理
7. 新增监控平均处理时间

fangjiasheng vor 9 Monaten
Ursprung
Commit
ab202ff1fc
58 geänderte Dateien mit 12210 neuen und 640 gelöschten Zeilen
  1. 2452 16
      botr/extract_table.py
  2. 5 0
      botr/utils.py
  3. 1 1
      config/interface_new.yml
  4. 287 118
      format_convert/convert.py
  5. 94 22
      format_convert/convert_doc.py
  6. 205 18
      format_convert/convert_docx.py
  7. 819 25
      format_convert/convert_image.py
  8. 26 9
      format_convert/convert_need_interface.py
  9. 75 0
      format_convert/convert_ofd.py
  10. 75 0
      format_convert/convert_ofd_test.py
  11. 352 51
      format_convert/convert_pdf.py
  12. 30 36
      format_convert/convert_test.py
  13. 91 4
      format_convert/convert_tree.py
  14. 61 0
      format_convert/convert_wps.py
  15. 6 0
      format_convert/easyofd/easyofd/__init__.py
  16. 474 0
      format_convert/easyofd/easyofd/chinese_characters.txt
  17. 23 0
      format_convert/easyofd/easyofd/draw/__init__.py
  18. 290 0
      format_convert/easyofd/easyofd/draw/draw_ofd.py
  19. 1178 0
      format_convert/easyofd/easyofd/draw/draw_pdf.py
  20. 113 0
      format_convert/easyofd/easyofd/draw/find_seal_img.py
  21. 216 0
      format_convert/easyofd/easyofd/draw/font_tools.py
  22. 666 0
      format_convert/easyofd/easyofd/draw/ofdtemplate.py
  23. 966 0
      format_convert/easyofd/easyofd/draw/pdf_parse.py
  24. BIN
      format_convert/easyofd/easyofd/draw/simsun.ttc
  25. 301 0
      format_convert/easyofd/easyofd/ofd.py
  26. 37 0
      format_convert/easyofd/easyofd/parser_ofd/__init__.py
  27. 145 0
      format_convert/easyofd/easyofd/parser_ofd/file_annotation_parser.py
  28. 7 0
      format_convert/easyofd/easyofd/parser_ofd/file_attachment_parser.py
  29. 140 0
      format_convert/easyofd/easyofd/parser_ofd/file_content_parser.py
  30. 7 0
      format_convert/easyofd/easyofd/parser_ofd/file_customtag_parser.py
  31. 104 0
      format_convert/easyofd/easyofd/parser_ofd/file_deal.py
  32. 99 0
      format_convert/easyofd/easyofd/parser_ofd/file_doc_parser.py
  33. 36 0
      format_convert/easyofd/easyofd/parser_ofd/file_docres_parser.py
  34. 41 0
      format_convert/easyofd/easyofd/parser_ofd/file_ofd_parser.py
  35. 58 0
      format_convert/easyofd/easyofd/parser_ofd/file_parser.py
  36. 63 0
      format_convert/easyofd/easyofd/parser_ofd/file_parser_base.py
  37. 52 0
      format_convert/easyofd/easyofd/parser_ofd/file_publicres_parser.py
  38. 63 0
      format_convert/easyofd/easyofd/parser_ofd/file_signature_parser.py
  39. 100 0
      format_convert/easyofd/easyofd/parser_ofd/find_seal_img.py
  40. 35 0
      format_convert/easyofd/easyofd/parser_ofd/img_deal.py
  41. 607 0
      format_convert/easyofd/easyofd/parser_ofd/ofd_parser.py
  42. 31 0
      format_convert/easyofd/easyofd/parser_ofd/parameter_parser.py
  43. 61 0
      format_convert/easyofd/easyofd/parser_ofd/path_parser.py
  44. 7 0
      format_convert/easyofd/easyofd/template_ofd/__init__.py
  45. 53 0
      format_convert/font_map/extend_to_normal_dict.txt
  46. 214 0
      format_convert/font_map/kangxi_to_normal
  47. 154 0
      format_convert/font_map/kangxi_to_normal_dict.txt
  48. 327 0
      format_convert/ofd/ofd_parser.py
  49. 320 12
      format_convert/utils.py
  50. 3 0
      monitor/watch_10_minutes_process.sh
  51. 8 26
      ocr/ocr_interface.py
  52. 6 2
      ocr/ppocr/data/__init__.py
  53. 39 0
      ocr/test_lock.py
  54. 36 52
      ocr/tools/infer/predict_det_pytorch.py
  55. 188 173
      ocr/tools/infer/predict_rec_pytorch.py
  56. 106 45
      ocr/tools/infer/predict_system.py
  57. 1 0
      start_and_stop/kill_convert.sh
  58. 256 30
      tika_/tika_interface.py

+ 2452 - 16
botr/extract_table.py

@@ -1,29 +1,37 @@
+import copy
+import math
+import os
 import re
 import time
 import traceback
+from glob import glob
+import numpy as np
 import cv2
+import wcwidth
 from pdfminer.layout import LTLine
 # from botr.nsp.predict import nsp_predict
+from sklearn.cluster import KMeans
+
 from botr.rules.get_table_by_rules import get_table_by_rule
 from botr.utils import line_iou, get_table_iou
 from format_convert.convert_need_interface import from_yolo_interface
-from format_convert.utils import log, np2bytes
+from format_convert.utils import log, np2bytes, text_bbox_to_lt, pil_resize, memory_decorator
 
 
 def b_table_process(list_line, list_text_boxes, list_cell, table_location):
     def merge_textbox(textbox_list, in_objs):
         delete_obj = []
         threshold = 5
-        textbox_list.sort(key=lambda x:x.bbox[0])
+        textbox_list.sort(key=lambda x: x.bbox[0])
         for k in range(len(textbox_list)):
             tb1 = textbox_list[k]
             if tb1 not in in_objs and tb1 not in delete_obj:
-                for m in range(k+1, len(textbox_list)):
+                for m in range(k + 1, len(textbox_list)):
                     tb2 = textbox_list[m]
                     if tb2 in in_objs:
                         continue
-                    if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
-                            and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
+                    if abs(tb1.bbox[1] - tb2.bbox[1]) <= threshold \
+                            and abs(tb1.bbox[3] - tb2.bbox[3]) <= threshold:
                         if tb1.bbox[0] <= tb2.bbox[0]:
                             tb1.text = tb1.text + tb2.text
                         else:
@@ -35,6 +43,7 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
             if _obj in textbox_list:
                 textbox_list.remove(_obj)
         return textbox_list
+
     try:
         if list_line:
             from format_convert.convert_tree import TableLine
@@ -55,7 +64,7 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
             current_y = area_list_text_boxes[0].bbox[1]
             current_y2 = area_list_text_boxes[0].bbox[3]
             # threshold = 2.
-            threshold = max(2., 1/3 * abs(current_y2 - current_y))
+            threshold = max(2., 1 / 3 * abs(current_y2 - current_y))
             for t_b in area_list_text_boxes:
                 bbox = t_b.bbox
                 if current_y - threshold <= bbox[1] <= current_y + threshold:
@@ -69,6 +78,11 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
             obj_in_table = []
             table_dict = {'bbox': table_location}
             row_list = []
+
+            # yolo检测出的表格,忽略两列的,因为已经补充了两列的新规则 250529
+            if list_cell and len(list_cell[0]) == 2:
+                return list_text_boxes, [], set()
+
             for row in list_cell:
                 col_list = []
                 for col in row:
@@ -112,17 +126,19 @@ def get_text_box_obj(_text_list, _bbox_list):
     return _text_box_list
 
 
-def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
+def get_table(img, table_list, text_list, bbox_list, text_box_list, from_pdf=False, show=0):
     log('start')
     # 检测无边框表格
     start_time_all = time.time()
     start_time = time.time()
     img_bytes = np2bytes(img)
     b_table_list = from_yolo_interface(img_bytes)
-    log('yolo detect cost: ' + str(time.time()-start_time))
+    log('yolo detect cost: ' + str(time.time() - start_time))
     b_table_list = b_table_list[0]
     if not b_table_list:
         log('detect not b_table_list')
+        if from_pdf:
+            save_b_table(img)
         return [], [], []
 
     # if show:
@@ -156,8 +172,9 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
         b_loc = [min_x, min_y, max_x, max_y, b_table[4]]
         inter_flag = False
         for table in table_list:
-            loc = table.get('bbox')
-            rows = table.get('table')
+            # loc = table.get('bbox')
+            loc = table.bbox
+            # rows = table.get('table')
             iou = line_iou([[0, loc[1]], [0, loc[3]]], [[0, b_loc[1]], [0, b_loc[3]]], axis=1)
             if iou > 0.3:
                 # if len(rows) <= 1:
@@ -190,7 +207,7 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
             if b_loc1 in used_b_loc:
                 continue
             inter_flag = False
-            for j in range(i+1, len(b_table_location_list)):
+            for j in range(i + 1, len(b_table_location_list)):
                 b_loc2 = b_table_location_list[j]
                 iou = line_iou([[0, b_loc1[1]], [0, b_loc1[3]]], [[0, b_loc2[1]], [0, b_loc2[3]]], axis=1)
                 if show:
@@ -230,7 +247,8 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
 
         # 根据ocr bbox,规则生成表格线
         start_time = time.time()
-        line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list, b_loc, show=show)
+        line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list,
+                                                                                 b_loc, show=show)
         if not table_location:
             log('get_table_by_rule not table_location')
             continue
@@ -240,14 +258,15 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
             area_bbox_list.append(eval(key))
             area_text_list.append(bbox_text_dict.get(key))
         b_text_box_list = get_text_box_obj(area_text_list, area_bbox_list)
-        log('get_table_by_rule cost: ' + str(time.time()-start_time))
+        log('get_table_by_rule cost: ' + str(time.time() - start_time))
 
         # 根据表格线生成单元格
         start_time = time.time()
-        b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list, table_location)
+        b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list,
+                                                                           table_location)
         table_list += _table_list
         obj_in_table_list += _obj_in_table_list
-        log('b_table_process cost: ' + str(time.time()-start_time))
+        log('b_table_process cost: ' + str(time.time() - start_time))
 
         # if not table_list:
         #     log('table_process not table_list')
@@ -317,4 +336,2421 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
         # _table_list[0]['table'] = new_table
 
     log('get_table finish ' + str(time.time() - start_time_all))
-    return text_box_list, table_list, obj_in_table_list
+    return text_box_list, table_list, obj_in_table_list
+
+
+def save_b_table(image_np):
+    _start_time = time.time()
+    _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_not_detect'
+    # _path = 'D:/Project/format_conversion_maxcompute/save_b_table_not_detect'
+    max_index = 20000
+    if os.path.exists(_path):
+        file_list = glob(_path + '/*')
+        if file_list:
+            file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
+            file_index_list.sort(key=lambda x: x)
+            index = file_index_list[-1] + 1
+        else:
+            index = 0
+        if index > max_index:
+            return
+
+        # 文件md5
+        from format_convert import _global
+        _md5 = _global.get("md5")
+
+        _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
+        cv2.imwrite(_image_path, image_np)
+        log('save yolo not detect b_table image success!')
+
+
+@memory_decorator
+def get_b_table_by_blank_colon(lt_text_list, table_list, layout_bbox, image_np=None, show=0):
+    start_time = time.time()
+
+    # print('len(lt_text_list)', len(lt_text_list))
+    # for lt_text in lt_text_list:
+    #     print('lt_text', lt_text)
+
+    # 新增冒号提前判断
+    colon_cnt = 0
+    for lt_text in lt_text_list:
+        if re.search('[::]', lt_text.get_text()):
+            colon_cnt += 1
+    if colon_cnt <= 6:
+        log('pre judge colon_cnt <= 6')
+        return [], []
+
+    # 图片类型,限制lt_text_list个数,并且很多是单字的
+    if image_np is not None and len(lt_text_list) >= 60:
+        single_char_cnt = 0
+        for lt_text in lt_text_list:
+            if len(lt_text.get_text()) <= 1:
+                single_char_cnt += 1
+        # log('len(lt_text_list), single_char_cnt ' + str(len(lt_text_list)) + ' ' + str(single_char_cnt))
+        if single_char_cnt > 50 or single_char_cnt > 1/3 * len(lt_text_list):
+            return [], []
+
+    # raise
+    # 有些确定为非表格,也输出,防止后续YOLO判断为表格,搞乱数据
+    not_b_table_list = []
+
+    layout_h = int(layout_bbox[3])
+    layout_w = int(layout_bbox[2])
+
+    if show:
+        print('layout_w, layout_h', layout_w, layout_h)
+        show_image = np.full((layout_h, layout_w, 3), 255, dtype=np.uint8)
+
+    if show and image_np is not None:
+        image_np_show = copy.copy(image_np)
+        for lt_text in lt_text_list:
+            bbox = [int(x) for x in lt_text.bbox]
+            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
+        cv2.imshow('image origin', image_np_show)
+        cv2.waitKey(0)
+
+    # pdf类型预处理
+    start_time1 = time.time()
+    if image_np is None:
+        # 把单个lt_text中,中间多个空格分割的分开
+        lt_text_list = split_lt_text_by_many_space(lt_text_list)
+
+        if show:
+            for lt_text in lt_text_list:
+                bbox = [int(x) for x in lt_text.bbox]
+                cv2.rectangle(show_image, bbox[:2], bbox[2:4], (0, 0, 255))
+            cv2.imshow('pdf preprocess', show_image)
+            cv2.waitKey(0)
+        # log('get_b_table_by_blank_colon pdf preprocess cost: ' + str(time.time()-start_time1))
+
+    # 图片类型预处理
+    start_time1 = time.time()
+    if image_np is not None:
+        # 删除空的
+        start_time2 = time.time()
+        lt_text_list = delete_empty_bbox(lt_text_list)
+        # print('delete_empty_bbox cost: ', time.time()-start_time2)
+
+        # ocr识别的文本框需处理后紧贴文本,才能依靠空白分行
+        start_time2 = time.time()
+        new_bbox_list = shrink_bbox(image_np, [x.bbox for x in lt_text_list])
+        # print('shrink_bbox cost: ', time.time()-start_time2)
+        start_time2 = time.time()
+        for i, lt_text in enumerate(lt_text_list):
+            lt_text.bbox = new_bbox_list[i]
+        # print('lt_text.bbox = new_bbox_list[i] cost: ', time.time()-start_time2)
+        # log('get_b_table_by_blank_colon image preprocess1 cost: ' + str(time.time()-start_time1))
+
+    # 计算单字平均距离
+    start_time1 = time.time()
+    all_char_cnt = 0
+    all_text_width = 0
+    for lt_text in lt_text_list:
+        all_char_cnt += len(lt_text.get_text())
+        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
+    if all_char_cnt == 0:
+        return [], not_b_table_list
+    avg_char_width = all_text_width / all_char_cnt
+
+    # 图片类型预处理2
+    if image_np is not None:
+        # ocr识别的表格的值可能因空格分开,合并
+        lt_text_list = merge_same_bbox(lt_text_list, avg_char_width)
+
+        # bbox交叉,修复
+        lt_text_list = fix_cross_bbox(lt_text_list)
+        # log('get_b_table_by_blank_colon image preprocess2 cost: ' + str(time.time()-start_time1))
+
+    if show and image_np is not None:
+        image_np_show = copy.copy(image_np)
+        for lt_text in lt_text_list:
+            bbox = [int(x) for x in lt_text.bbox]
+            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
+        cv2.imshow('image preprocess', image_np_show)
+        cv2.waitKey(0)
+
+    if show:
+        for lt_text in lt_text_list:
+            print('lt_text', lt_text)
+
+    # 过滤xy值过大过小的
+    temp_list = []
+    for lt_text in lt_text_list:
+        if min(lt_text.bbox) < 0 or max(lt_text.bbox) > 10000:
+            continue
+        temp_list.append(lt_text)
+    lt_text_list = temp_list
+
+    if show:
+        for lt_text in lt_text_list:
+            cv2.rectangle(show_image,
+                          (int(lt_text.bbox[0]), int(lt_text.bbox[1])),
+                          (int(lt_text.bbox[2]), int(lt_text.bbox[3])),
+                          (0, 0, 255)
+                          )
+        for table in table_list:
+            cv2.rectangle(show_image,
+                          (int(table.bbox[0]), int(table.bbox[1])),
+                          (int(table.bbox[2]), int(table.bbox[3])),
+                          (0, 255, 0)
+                          )
+
+    # 计算单字平均距离
+    all_char_cnt = 0
+    all_text_width = 0
+    for lt_text in lt_text_list:
+        all_char_cnt += len(lt_text.get_text())
+        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
+    if all_char_cnt == 0:
+        return [], not_b_table_list
+    avg_char_width = all_text_width / all_char_cnt
+    if show:
+        print('avg_char_width', avg_char_width)
+
+    if image_np is None:
+        blank_width = 1 * avg_char_width
+    else:
+        blank_width = 1 * avg_char_width
+    if show:
+        print('blank_width', blank_width)
+
+    # 根据有边框表格位置,将该页分为多个区域
+    table_h_list = []
+    area_h_list = []
+    area_start_h = 0
+    table_list.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3]))
+    for table in table_list:
+        table_h_list.append([table.bbox[1], table.bbox[3]])
+        area_h_list.append([area_start_h, table.bbox[1]])
+        area_start_h = table.bbox[3]
+    area_h_list.append([area_start_h, layout_h])
+
+    if show:
+        for min_h, max_h in area_h_list:
+            print('area_h_list', min_h, max_h)
+            cv2.rectangle(show_image,
+                          (0, int(min_h)),
+                          (layout_w, int(max_h)),
+                          (255, 0, 0)
+                          )
+
+    lt_text_area_list = []
+    for area_min_h, area_max_h in area_h_list:
+        sub_area = []
+        for lt_text in lt_text_list:
+            if area_min_h <= lt_text.bbox[1] <= lt_text.bbox[3] <= area_max_h:
+                sub_area.append(lt_text)
+        lt_text_area_list.append(sub_area)
+    if show:
+        print('len(lt_text_area_list)', len(lt_text_area_list))
+
+    # 每个区域分别进行判断无边框表格
+    result_table_list = []
+    start_time1 = time.time()
+    for sub_lt_text_list in lt_text_area_list:
+        start_time2 = time.time()
+        lt_text_row_list = get_text_row_by_blank(sub_lt_text_list, layout_h)
+        # log('get_text_row_by_blank cost: ' + str(time.time()-start_time2))
+
+        # 有补充的占位lt_text,需添加到lt_text_list
+        for row in lt_text_row_list:
+            for lt_text in row:
+                if lt_text not in lt_text_list:
+                    lt_text_list.append(lt_text)
+
+        if show:
+            for row in lt_text_row_list:
+                print('row', row)
+
+        start_time2 = time.time()
+        b_table_list1, b_table_bbox_list1 = get_b_table_by_lt_text_row(lt_text_row_list)
+        # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2))
+
+        # 确定区域后,对表格内重新分行,更精准
+        start_time2 = time.time()
+        table_lt_text_row_list = []
+        for bi, b_table in enumerate(b_table_list1):
+            b_table_bbox = b_table_bbox_list1[bi]
+            sub_lt_text_list = []
+            for lt_text in lt_text_list:
+                if b_table_bbox[1] <= lt_text.bbox[1] <= lt_text.bbox[3] <= b_table_bbox[3]:
+                    sub_lt_text_list.append(lt_text)
+            _lt_text_row_list, center_blank_row = get_text_row_by_center_blank(b_table, sub_lt_text_list, blank_width,
+                                                                               layout_h)
+            table_lt_text_row_list += _lt_text_row_list
+        # log('get_text_row_by_center_blank cost: ' + str(time.time()-start_time2))
+
+        start_time2 = time.time()
+        b_table_list3, b_table_bbox_list3 = get_b_table_by_lt_text_row(table_lt_text_row_list)
+        # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2))
+
+        if show:
+            for b_table in b_table_list3:
+                print('b_table3', b_table)
+
+        # 对大致的表格进行列判断,表格内不同列的框不能交叉,可以重合,需有一定空白
+        start_time2 = time.time()
+        b_table_list2 = []
+        for b_table in b_table_list3:
+
+            blank_row_list = get_blank_row(b_table, blank_width)
+            if show:
+                print('b_table get_blank_row b_table_list3', b_table)
+                print('blank_row_list b_table_list3', blank_row_list)
+
+            b_table2 = []
+            for bi, lt_text_row1 in enumerate(b_table[:-1]):
+                lt_text_row2 = b_table[bi + 1]
+                # if row1_row2_has_same_col(lt_text_row1, lt_text_row2):
+                if row1_row2_has_same_blank(blank_row_list[bi], blank_row_list[bi + 1]):
+                    if lt_text_row1 not in b_table2:
+                        b_table2.append(lt_text_row1)
+                    if lt_text_row2 not in b_table2:
+                        b_table2.append(lt_text_row2)
+                else:
+                    # print('not cross blank', blank_row_list[bi], blank_row_list[bi + 1])
+                    if len(b_table2) >= 2:
+                        b_table_list2.append(b_table2)
+                    b_table2 = []
+            if len(b_table2) >= 2:
+                b_table_list2.append(b_table2)
+        # log('get_blank_row cost: ' + str(time.time()-start_time2))
+
+        if show:
+            for b_table2 in b_table_list2:
+                print('b_table2')
+                for lt_text_row in b_table2:
+                    print('b_table2 lt_text_row', lt_text_row)
+
+        start_time2 = time.time()
+        for bi, b_table2 in enumerate(b_table_list2):
+            # 根据冒号得到表格
+            start_time3 = time.time()
+            table2, center_blank_row, _not_b_table_bbox_list, table_bbox \
+                = get_b_table_by_colon(b_table2, blank_width)
+            log('get_b_table_by_colon cost: ' + str(time.time()-start_time3))
+            not_b_table_list += [[[], x] for x in _not_b_table_bbox_list]
+
+            if show and center_blank_row:
+                print('show center_blank_row', center_blank_row)
+                bx = int((center_blank_row[2] + center_blank_row[0]) / 2)
+                by = int((center_blank_row[3] + center_blank_row[1]) / 2)
+                br = int((center_blank_row[2] - center_blank_row[0]) / 2)
+                if br <= 5:
+                    br = 5
+                print('bx, by, br', bx, by, br)
+                cv2.circle(show_image, (bx, by), br, (0, 255, 0))
+
+            if show:
+                min_w, min_h, max_w, max_h = table_bbox
+                cv2.rectangle(show_image,
+                              (int(min_w), int(min_h)),
+                              (int(max_w), int(max_h)),
+                              (0, 255, 0)
+                              )
+
+            # 修复最后一行跨行
+            # table2 = fix_final_row(table2)
+
+            # 表格末尾有些只有一列的需补充
+            table2 = add_last_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)
+
+            table2 = add_first_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)
+
+            # table格式转化
+            table2 = table_list_to_dict(table2)
+
+            # 表格一些标准化,比如去掉占位符
+            table2 = standard_table(table2)
+
+            if table2:
+                result_table_list.append([table2, table_bbox])
+        # log('colon, add, standard cost: ' + str(time.time()-start_time2))
+
+    # log('get_b_table_by_blank_colon area get b_table cost: ' + str(time.time()-start_time1))
+
+    if show:
+        cv2.namedWindow("final result", cv2.WINDOW_NORMAL)
+        cv2.resizeWindow("final result", 768, 1024)
+        cv2.imshow('final result', show_image)
+        cv2.waitKey(0)
+
+    if show:
+        for table in result_table_list:
+            print('get_b_table_by_bbox table ', table)
+
+        for not_table_bbox in not_b_table_list:
+            print('not_table bbox ', not_table_bbox)
+
+    # log('get_b_table_by_blank_colon cost: ' + str(time.time()-start_time))
+    return result_table_list, not_b_table_list
+
+
+def get_b_table_by_lt_text_row(lt_text_row_list, show=0):
+    # 先大致确定区域,列数大于2的区域
+    b_table_list1 = []
+    b_table = []
+
+    for lt_text_row in lt_text_row_list:
+        if len(lt_text_row) >= 2:
+            b_table.append(lt_text_row)
+        else:
+            if len(b_table) >= 2:
+                b_table_list1.append(b_table)
+            b_table = []
+    if len(b_table) >= 2:
+        b_table_list1.append(b_table)
+
+    # 获取bbox
+    b_table_bbox_list = []
+    for b_table in b_table_list1:
+        x1 = min([y.bbox[0] for x in b_table for y in x])
+        y1 = min([y.bbox[1] for x in b_table for y in x])
+        x2 = max([y.bbox[2] for x in b_table for y in x])
+        y2 = max([y.bbox[3] for x in b_table for y in x])
+
+        b_table_bbox_list.append([x1, y1, x2, y2])
+
+    if show:
+        for b_table in b_table_list1:
+            print('b_table')
+            for lt_text_row in b_table:
+                print('b_table lt_text_row', lt_text_row)
+    return b_table_list1, b_table_bbox_list
+
+
+def row1_row2_has_same_col(row1, row2):
+    threshold = 5
+    blank_len = 2
+    cross_flag = 0
+    for lt_text1 in row1:
+        for lt_text2 in row2:
+            if lt_text2.bbox[0] - lt_text1.bbox[2] >= blank_len \
+                    or lt_text1.bbox[0] - lt_text2.bbox[2] >= blank_len \
+                    or lt_text1.bbox[0] - threshold <= lt_text2.bbox[0] < lt_text2.bbox[2] <= lt_text1.bbox[
+                2] + threshold \
+                    or lt_text2.bbox[0] - threshold <= lt_text1.bbox[0] < lt_text1.bbox[2] <= lt_text2.bbox[
+                2] + threshold:
+                pass
+            else:
+                cross_flag = 1
+    if cross_flag:
+        return False
+    else:
+        return True
+
+
+def get_blank_row(lt_text_row_list, blank_min_width, show=0):
+    # 获取空白行
+    blank_row_list = []
+    # blank_min_width = avg_char_width * 3
+    for lt_text_row in lt_text_row_list:
+        lt_text_row.sort(key=lambda x: x.bbox[0])
+        blank_row = []
+        if len(lt_text_row) < 2:
+            blank_row_list.append([])
+        else:
+            # 行内lt_text两两生成空白
+            for lt_text1 in lt_text_row:
+                sub_row = []
+                for lt_text2 in lt_text_row:
+                    if lt_text1 == lt_text2:
+                        continue
+                    # 必须从左到右
+                    if lt_text1.bbox[2] > lt_text2.bbox[0]:
+                        continue
+                    line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
+                    line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
+                    if line_iou(line1, line2) > 0:
+                        continue
+                    sub_row.append([min(lt_text1.bbox[2], lt_text2.bbox[0]),
+                                    min(lt_text1.bbox[3], lt_text2.bbox[1]),
+                                    max(lt_text1.bbox[2], lt_text2.bbox[0]),
+                                    max(lt_text1.bbox[3], lt_text2.bbox[1]),
+                                    ])
+                    if show:
+                        print('sub_row', lt_text1.get_text(), lt_text2.get_text(), sub_row[-1])
+
+                # 每个lt_text只找出其对应的最小的空白
+                if not sub_row:
+                    continue
+                sub_row.sort(key=lambda x: abs(x[0] - x[2]))
+                if show:
+                    print('sub_row[-1]', lt_text1.get_text(), sub_row[-1])
+
+                blank_row.append(sub_row[0])
+
+            # 判断最小距离,一行至少有一段空白大于最小距离
+            match_flag = 0
+            for r in blank_row:
+                if abs(r[2] - r[0]) >= blank_min_width:
+                    match_flag = 1
+                    break
+            if match_flag:
+                blank_row_list.append(blank_row)
+            else:
+                blank_row_list.append([])
+
+    return blank_row_list
+
+
+def row1_row2_has_same_blank(row1, row2):
+    # row1的任一空白,都能和row2的任一空白相交
+    cross_flag = 0
+    for blank1 in row1:
+        if cross_flag == 1:
+            break
+        for blank2 in row2:
+            if blank1[0] <= blank2[0] <= blank1[2] \
+                    or blank1[0] <= blank2[2] <= blank1[2] \
+                    or blank2[0] <= blank1[0] <= blank2[2] \
+                    or blank2[0] <= blank1[2] <= blank2[2]:
+                cross_flag = 1
+                break
+
+    if cross_flag:
+        return True
+    else:
+        return False
+
+
+@memory_decorator
+def get_b_table_by_colon(b_table, blank_width, show=0):
+    # print('into get_b_table_by_colon')
+
+    table_bbox = get_table_bbox(b_table)
+
+    # 有些确定为非表格,也输出,防止后续YOLO判断为表格,搞乱数据
+    not_table_bbox_list = []
+
+    #
+    # row_cnt_list = [len(x) in [2, 3, 4] for x in b_table]
+
+    # 所有行需是2列或4列,同一列算作一列
+    row_cnt_list = []
+    head_cnt_list = []
+    for row in b_table:
+        if not row:
+            continue
+        row.sort(key=lambda x: (x.bbox[0]))
+        col_cnt = 1
+        head_cnt = 0
+        if re.search('[::]', row[0].get_text()):
+            head_cnt += 1
+        for ci, col in enumerate(row):
+            if ci == 0:
+                continue
+            col1 = row[ci - 1]
+            col2 = row[ci]
+            line1 = [(col1.bbox[0], 0), (col1.bbox[2], 0)]
+            line2 = [(col2.bbox[0], 0), (col2.bbox[2], 0)]
+            if line_iou(line1, line2) >= 0.5:
+                continue
+            else:
+                col_cnt += 1
+                if re.search('[::]', col2.get_text()):
+                    head_cnt += 1
+        row_cnt_list.append(col_cnt in [2, 3, 4])
+        head_cnt_list.append(head_cnt)
+
+    if show:
+        print('row_cnt_list', row_cnt_list)
+        print('head_cnt_list', head_cnt_list)
+
+    if max(head_cnt_list) > 2:
+        if show:
+            for row in b_table:
+                print('head_cnt_list row', row)
+        return [], None, not_table_bbox_list, table_bbox
+
+    # 最后一行年月日可能会影响列数,不是234列
+    if row_cnt_list[-1] is False:
+        row_cnt_list = row_cnt_list[:-1]
+        b_table = b_table[:-1]
+        table_bbox = get_table_bbox(b_table)
+
+    row_cnt_list = list(set(row_cnt_list))
+    if not (len(row_cnt_list) == 1 and row_cnt_list[0] is True):
+        return [], None, not_table_bbox_list, table_bbox
+
+    # 至少有2个以上文本包含冒号
+    colon_cnt = 0
+    for lt_text_row in b_table:
+        for lt_text in lt_text_row:
+            if re.search('[::]', lt_text.get_text()) and re.search('[\u4e00-\u9fff]', lt_text.get_text()):
+                colon_cnt += 1
+    if show:
+        print('colon_cnt, len(table)', colon_cnt, len(b_table))
+    # if colon_cnt < 2:
+    if colon_cnt < len(b_table) / 2:
+        return [], None, not_table_bbox_list, table_bbox
+
+    blank_row_list = get_blank_row(b_table, blank_width)
+    if show:
+        print('b_table get_blank_row colon', b_table)
+        print('blank_row_list colon', blank_row_list)
+    # blank_row_list = [y for x in blank_row_list for y in x]
+    # print('blank_row_list2', blank_row_list)
+    # # 先选最长空白包含的所有空白
+    # blank_row_list.sort(key=lambda x: abs(x[0]-x[2]), reverse=True)
+    # max_blank = blank_row_list[0]
+    # if show:
+    #     print('max_blank', max_blank)
+    # if abs(max_blank[0]-max_blank[2]) <= 4 * avg_char_width:
+    #     return []
+    # max_col = []
+    # for blank_row_bbox in blank_row_list:
+    #     if max_blank[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= max_blank[2]:
+    #         max_col.append(blank_row_bbox)
+    # if show:
+    #     print('max_col', max_col)
+    # if not max_col:
+    #     return []
+    # # 选取被包含最多的空白
+    # blank_contain_cnt_dict = {}
+    # for bi, blank_row_bbox in enumerate(max_col):
+    #     blank_contain_cnt_dict[bi] = 0
+    #     for blank_row_bbox2 in max_col:
+    #         if blank_row_bbox2[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= blank_row_bbox2[2]:
+    #             blank_contain_cnt_dict[bi] += 1
+    # blank_contain_cnt_list = [[k, v] for k, v in blank_contain_cnt_dict.items()]
+    # blank_contain_cnt_list.sort(key=lambda x: x[1])
+    # if show:
+    #     print('blank_contain_cnt_list', blank_contain_cnt_list)
+    # center_blank_row = max_col[blank_contain_cnt_list[-1][0]]
+
+    center_blank_row = choose_center_blank(blank_row_list, blank_width)
+    if show:
+        print('center_blank_row', center_blank_row)
+
+    # 获取中心最短的空白,作为参考
+    # blank_list = [get_blank_row(x) for x in b_table]
+    # blank_list = [x[0] if len(x) == 1 else x[1] for x in blank_list]
+    # blank_list.sort(key=lambda x: abs(x[2] - x[0]))
+    # center_blank = blank_list[0]
+    #
+    # print('center_blank', center_blank)
+
+    # 根据中心空白,分为两列
+    # col_list1 = []
+    # col_list2 = []
+    # col_box_dict = {}
+    # for lt_text_row in b_table:
+    #     lt_text_row.sort(key=lambda x: x.bbox[0])
+    #     # if len(lt_text_row) == 4:
+    #     #     text1 = lt_text_row[0].get_text() + lt_text_row[1].get_text()
+    #     #     text2 = lt_text_row[2].get_text() + lt_text_row[3].get_text()
+    #     #     box1 = [
+    #     #         min(lt_text_row[0].bbox[0], lt_text_row[1].bbox[0]),
+    #     #         max(lt_text_row[0].bbox[2], lt_text_row[1].bbox[2]),
+    #     #         min(lt_text_row[0].bbox[1], lt_text_row[1].bbox[1]),
+    #     #         max(lt_text_row[0].bbox[3], lt_text_row[1].bbox[3])
+    #     #     ]
+    #     #     box2 = [
+    #     #         min(lt_text_row[2].bbox[0], lt_text_row[3].bbox[0]),
+    #     #         max(lt_text_row[2].bbox[2], lt_text_row[3].bbox[2]),
+    #     #         min(lt_text_row[2].bbox[1], lt_text_row[3].bbox[1]),
+    #     #         max(lt_text_row[2].bbox[3], lt_text_row[3].bbox[3])
+    #     #     ]
+    #     #
+    #     #     # col_list1.append(text1)
+    #     #     # col_list2.append(text2)
+    #     # else:
+    #     #     text1 = lt_text_row[0].get_text()
+    #     #     text2 = lt_text_row[1].get_text()
+    #     #     box1 = lt_text_row[0].bbox
+    #     #     box2 = lt_text_row[1].bbox
+    #
+    #     left_col = []
+    #     right_col = []
+    #     for lt_text in lt_text_row:
+    #         if lt_text.bbox[2] <= center_blank_row[0]:
+    #             left_col.append(lt_text)
+    #         else:
+    #             right_col.append(lt_text)
+    #
+    #     left_text = [x.get_text() for x in left_col]
+    #     left_text = ''.join(left_text)
+    #     right_text = [x.get_text() for x in right_col]
+    #     right_text = ''.join(right_text)
+    #
+    #     text1 = left_text.strip()
+    #     text2 = right_text.strip()
+    #
+    #     # if text1 in col_box_dict.keys():
+    #     #     col_box_dict[text1] += [box1]
+    #     # else:
+    #     #     col_box_dict[text1] = [box1]
+    #     # if text2 in col_box_dict.keys():
+    #     #     col_box_dict[text2] += [box2]
+    #     # else:
+    #     #     col_box_dict[text2] = [box2]
+    #
+    #     col_list1.append(text1)
+    #     col_list2.append(text2)
+    #
+    # if show:
+    #     print('col_list1', col_list1)
+    #     print('col_list2', col_list2)
+
+    # col_key_value_list1 = []
+    # last_key = ""
+    # for col1 in col_list1:
+    #     match = re.search('[::]+', col1)
+    #     # 有冒号的
+    #     if match:
+    #         key = col1[:match.end()]
+    #         if last_key:
+    #             key = last_key + key
+    #             last_key = ""
+    #         value = col1[match.end():]
+    #         col_key_value_list1.append([key, value])
+    #     # 没有冒号的
+    #     else:
+    #         # 如果该值也存在在col_list2里,则看做表头,和下一行的表头连在一起
+    #         if col1 in col_list2:
+    #             if show:
+    #                 print('col1 in col_list2')
+    #             last_key = col1
+    #         # 不存在,则是上一行的值,和上一行的值连在一起
+    #         else:
+    #             if col_key_value_list1 and re.search('[::]', col_key_value_list1[-1][1]):
+    #                 col_key_value_list1[-1][1] += col1
+    #             else:
+    #                 col_key_value_list1.append(["", col1])
+    #
+    # if show:
+    #     print('col_key_value_list1', col_key_value_list1)
+    #
+    # col_key_value_list2 = []
+    # last_key = ""
+    # for col2 in col_list2:
+    #     match = re.search('[::]+', col2)
+    #     if match:
+    #         key = col2[:match.end()]
+    #         if last_key:
+    #             key = last_key + key
+    #             last_key = ""
+    #         value = col2[match.end():]
+    #         col_key_value_list2.append([key, value])
+    #     else:
+    #         # 如果该值也存在在col_list1里,则看做表头,和下一行的表头连在一起
+    #         if col2 in col_list1:
+    #             if show:
+    #                 print('col2 in col_list1')
+    #             last_key = col2
+    #         # 不存在,则是上一行的值,和上一行的值连在一起
+    #         else:
+    #             if col_key_value_list2 and re.search('[::]', col_key_value_list2[-1][1]):
+    #                 col_key_value_list2[-1][1] += col2
+    #             else:
+    #                 col_key_value_list2.append(["", col2])
+    #
+    # if show:
+    #     print('col_key_value_list2', col_key_value_list2)
+
+    if not center_blank_row:
+        return [], None, not_table_bbox_list, table_bbox
+
+    # 根据中心空白,分为两列
+    col_list1, col_list2 = divide_2_col_by_center_blank(b_table, center_blank_row)
+    # 非表格,一般是那种一行里键值离的较远的单列,加入非表格,后续yolo判断也忽略
+    if not col_list1 and not col_list2:
+        not_table_bbox = get_table_bbox(b_table)
+        not_table_bbox_list.append(not_table_bbox)
+        return [], None, not_table_bbox_list, table_bbox
+
+    # 两列中,分别设置head value
+    col_key_value_list1 = set_head_value_in_col(col_list1, col_list2)
+    col_key_value_list2 = set_head_value_in_col(col_list2, col_list1)
+
+    # 根据两列head value,形成行
+    b_table_row_list = []
+    for i in range(max(len(col_key_value_list1), len(col_key_value_list2))):
+        if i >= len(col_key_value_list1):
+            col1 = ["", ""]
+        else:
+            col1 = col_key_value_list1[i]
+        if i >= len(col_key_value_list2):
+            col2 = ["", ""]
+        else:
+            col2 = col_key_value_list2[i]
+
+        row = col1[:2] + col2[:2]
+        b_table_row_list.append(row)
+
+    # 删除空白列
+    # col_dict = {}
+    # for row in b_table_row_list:
+    #     for col_i, col in enumerate(row):
+    #         if col_i in col_dict.keys():
+    #             col_dict[col_i] += [col]
+    #         else:
+    #             col_dict[col_i] = [col]
+    # delete_col_i = []
+    # for col_i, cols in col_dict.items():
+    #     cols = list(set(cols))
+    #     if len(cols) == 1 and cols[0] == '':
+    #         delete_col_i.append(col_i)
+    #
+    # temp_list = []
+    # for row in b_table_row_list:
+    #     new_col = []
+    #     for col_i, col in enumerate(row):
+    #         if col_i in delete_col_i:
+    #             continue
+    #         new_col.append(col)
+    #     temp_list.append(new_col)
+    # b_table_row_list = temp_list
+
+    # 去掉删除空白列
+    # b_table_row_list = delete_blank_col(b_table_row_list)
+
+    # 修复因表头和值是同一列上下排列,导致的错位
+    b_table_row_list = fix_head_value_match(b_table_row_list)
+
+    if show:
+        print('b_table_row_list', b_table_row_list)
+    return b_table_row_list, center_blank_row, not_table_bbox_list, table_bbox
+
+
@memory_decorator
def get_text_row_by_blank(lt_text_list, layout_h, show=0):
    """Group text boxes into rows using the blank space above/below each box.

    :param lt_text_list: flat list of text boxes (objects with .bbox)
    :param layout_h: page/layout height, forwarded to the row builder
    :param show: print debug output when truthy
    :return: list of rows, each a list of text boxes
    """
    if show:
        for item in lt_text_list:
            print('lt_text_111', item)
    # Compute the vertical blank bands around every box, then merge boxes
    # whose blanks contain each other into the same row.
    blanks = get_up_down_blank(lt_text_list)
    rows = get_contain_blank_row(blanks, layout_h)
    if show:
        for row in rows:
            print('lt_text_row', row)
    return rows
+
+
def get_text_row_by_center_blank(b_table, lt_text_list, blank_width, layout_h, show=0):
    """Group text boxes into rows, using the table's central blank column as
    the left/right divider.

    :param b_table: rows of text boxes used to locate the central blank
    :param lt_text_list: flat list of text boxes to group into rows
    :param blank_width: minimum width for a blank to count as a divider
    :param layout_h: page/layout height, forwarded to the row builder
    :param show: print debug output when truthy
    :return: (rows, center_blank); both empty lists when no central blank
        column can be found
    """
    # Locate the per-row blank bands, then the blank that splits the table.
    row_blanks = get_blank_row(b_table, blank_width)
    if show:
        print('b_table get_blank_row center_blank', b_table)
        print('blank_row_list center_blank', row_blanks)

    center_blank = choose_center_blank(row_blanks, blank_width)
    if show:
        print('center_blank_row center', center_blank)
    if not center_blank:
        return [], []

    # Boxes are neighbours only when they sit on the same side of the
    # centre line of the central blank.
    center_x = (center_blank[2] + center_blank[0]) / 2
    blanks = get_up_down_blank(lt_text_list, center_x=center_x)
    rows = get_contain_blank_row(blanks, layout_h)

    if show:
        for row in rows:
            print('lt_text_row center', row)

    return rows, center_blank
+
+
def table_list_to_dict(table):
    """Convert a table of plain cell strings into cell dicts with span info.

    Every cell becomes ``{'rowspan': 1, 'columnspan': 1, 'text': cell}``.

    :param table: list of rows, each a list of cell texts
    :return: list of rows, each a list of cell dicts
    """
    return [
        [{'rowspan': 1, 'columnspan': 1, 'text': cell} for cell in row]
        for row in table
    ]
+
+
@memory_decorator
def get_up_down_blank(lt_text_list, center_x=None, show=0):
    """For each text box, find a blank band above and below it.

    :param lt_text_list: text boxes (objects with .bbox / .get_text());
        sorted in place by (top-y, left-x)
    :param center_x: optional x coordinate of a central divider; when given,
        only boxes on the same side count as neighbours, otherwise boxes
        must overlap horizontally (line_iou > 0)
    :param show: print debug output when truthy
    :return: list of [box, up_blank, down_blank], each blank a [y0, y1] pair
    """
    lt_text_list.sort(key=lambda t: (t.bbox[1], t.bbox[0]))
    result = []
    for cur in lt_text_list:
        cur_line = ((cur.bbox[0], 0), (cur.bbox[2], 0))
        if center_x is not None:
            cur_side = 0 if (cur.bbox[0] + cur.bbox[2]) / 2 <= center_x else 1

        above = []
        below = []
        for other in lt_text_list:
            if cur == other:
                continue
            # Decide whether the two boxes can belong to the same column.
            if center_x is None:
                other_line = ((other.bbox[0], 0), (other.bbox[2], 0))
                is_neighbour = line_iou(cur_line, other_line) > 0
            else:
                other_side = 0 if (other.bbox[0] + other.bbox[2]) / 2 <= center_x else 1
                is_neighbour = cur_side == other_side
            if not is_neighbour:
                continue
            if other.bbox[1] > cur.bbox[3]:
                below.append([cur.bbox[3], other.bbox[1]])
            if other.bbox[3] < cur.bbox[1]:
                above.append([other.bbox[3], cur.bbox[1]])

        # No neighbour found: fall back to a blank of the box's own height.
        text_h = abs(cur.bbox[3] - cur.bbox[1])
        if not above:
            above.append([max(0, cur.bbox[1] - text_h), cur.bbox[1]])
        if not below:
            below.append([cur.bbox[3], cur.bbox[3] + text_h])

        # The list is sorted top-to-bottom, so the first candidate below and
        # the last candidate above are the closest ones in scan order.
        down_blank = below[0]
        up_blank = above[-1]

        if show:
            print('lt_text1.get_text()', cur.get_text(), cur.bbox)
            if center_x is not None:
                print('center_x', center_x)
            print('up_blank', up_blank)
            print('down_blank', down_blank)

        result.append([cur, up_blank, down_blank])
    return result
+
+
@memory_decorator
def filter_large_blank_row(lt_text_blank_list, layout_h, show=0):
    """Pick out boxes whose surrounding blank is tall enough that they must
    form a row of their own.

    :param lt_text_blank_list: [box, up_blank, down_blank] triples; sorted in
        place by the box's (top-y, left-x)
    :param layout_h: page/layout height; blanks taller than layout_h/6 count
        as "large"
    :param show: print debug output when truthy
    :return: (rows, singles) - *rows* are single-box rows, *singles* lists
        the same boxes for membership checks
    """
    rows = []
    singles = []
    max_blank_h = layout_h / 6
    threshold = 20
    total = len(lt_text_blank_list)
    lt_text_blank_list.sort(key=lambda item: (item[0].bbox[1], item[0].bbox[0]))
    for index, (box, up_blank, down_blank) in enumerate(lt_text_blank_list):
        is_single = 0
        if index >= total - 4 and abs(up_blank[0] - up_blank[1]) >= max_blank_h:
            # Near the bottom of the page: a tall blank above is enough.
            if show:
                print('match single lt_text 1')
            is_single = 1
        elif index <= 2 and abs(down_blank[0] - down_blank[1]) >= max_blank_h:
            # Near the top of the page: a tall blank below is enough.
            if show:
                print('match single lt_text 2')
            is_single = 1
        elif 2 <= index <= total - 4 \
                and abs(up_blank[0] - down_blank[1]) >= max_blank_h:
            # Middle of the page: judge the combined up+down span, but only
            # when no other box shares (roughly) the same vertical band.
            same_row = 0
            for other, _, _ in lt_text_blank_list:
                if box == other:
                    continue
                if box.bbox[1] - threshold <= other.bbox[1] <= other.bbox[3] <= box.bbox[3] + threshold:
                    same_row = 1
                    break
            is_single = 0 if same_row else 1
            if show:
                print('match single lt_text 3')

        if is_single:
            rows.append([box])
            singles.append(box)

    if show:
        print('single_lt_text_list', singles)
    return rows, singles
+
+
@memory_decorator
def get_contain_blank_row(lt_text_blank_list, layout_h, show=0):
    """Merge text boxes into rows when their up/down blank bands contain
    each other.

    :param lt_text_blank_list: [box, up_blank, down_blank] triples as
        produced by get_up_down_blank()
    :param layout_h: page/layout height, used to detect oversized blanks
    :param show: print debug/timing output when truthy
    :return: list of rows, each a list of text boxes, sorted top-to-bottom
    """
    from format_convert.convert_tree import TextBox
    # Boxes surrounded by very tall blanks become single-box rows up front.
    lt_text_row_list, single_lt_text_list = filter_large_blank_row(lt_text_blank_list, layout_h)
    single_lt_text_list = set(single_lt_text_list)

    # Boxes whose blanks contain each other belong to the same row.
    time1 = time.time()
    threshold = 5
    used_lt_text_list = set([])
    another_used_lt_text_list = set([])
    for i1 in range(len(lt_text_blank_list)):
        time2 = time.time()
        lt_text1, up_blank1, down_blank1 = lt_text_blank_list[i1]
        row = []
        if lt_text1 in single_lt_text_list:
            continue
        for i2 in range(len(lt_text_blank_list)):
            lt_text2, up_blank2, down_blank2 = lt_text_blank_list[i2]
            if lt_text1 == lt_text2:
                continue
            if lt_text2 in another_used_lt_text_list:
                continue
            # Already placed in an earlier row and strictly above box1: skip.
            if lt_text2 in used_lt_text_list and lt_text1.bbox[1] >= lt_text2.bbox[3]:
                continue
            if lt_text2 in single_lt_text_list:
                continue

            # Same row when box2's up blank fits inside box1's up blank, or
            # box2's down blank fits inside box1's down blank (+/- threshold).
            if (up_blank1[0] - threshold <= up_blank2[0] <= up_blank2[1] <= up_blank1[1] + threshold) \
                    or (down_blank1[0] - threshold <= down_blank2[0] <= down_blank2[1] <= down_blank1[1] + threshold):
                    # or (up_blank2[0] - threshold <= up_blank1[0] <= up_blank1[1] <= up_blank2[1] + threshold) \
                    # or (down_blank2[0] - threshold <= down_blank1[0] <= down_blank1[1] <= down_blank2[1] + threshold):
                if lt_text2 not in row:
                    row.append(lt_text2)
                    used_lt_text_list.add(lt_text2)

            # Also plausible: box1's up/down blanks contain box2's text itself.
            # if up_blank1[0] <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= down_blank1[1]:
            #     if lt_text2 not in row:
            #         row.append(lt_text2)
            #         used_lt_text_list.append(lt_text2)



        if lt_text1 not in row:
            row.append(lt_text1)

        if show:
            print('get_contain_blank_row loop2 cost:', time.time()-time2)

        # If a row holds 3+ colon-bearing texts, a standalone row was merged
        # in by mistake - split the outliers back out.
        time2 = time.time()
        colon_cnt = 0
        colon_lt_text = []
        for lt in row:
            if re.search('[::]', lt.get_text()):
                colon_cnt += 1
                colon_lt_text.append(lt)
        if colon_cnt >= 3:
            if show:
                print('colon_cnt >= 3 row', row)

            another_lt_text_list = find_outline_lt_text(row)

            # # Put the box with the largest y on its own row.
            # colon_lt_text.sort(key=lambda x: x.bbox[1])
            # # All but the first two go on their own rows.
            # another_lt_text_list = colon_lt_text[2:]
            for lt_text in another_lt_text_list:
                if lt_text in row:
                    row.remove(lt_text)
                if lt_text in colon_lt_text:
                    colon_lt_text.remove(lt_text)

            if show:
                print('another_lt_text_list', another_lt_text_list)
                print('colon_lt_text', colon_lt_text)

            if not colon_lt_text:
                continue

            # Align each outlier under the nearer colon column and pad the
            # other side with a "@@:" placeholder cell.
            colon_lt_text.sort(key=lambda x: x.bbox[0])
            lt_text_row_list.append(row)
            for another_lt_text in another_lt_text_list:
                if abs(another_lt_text.bbox[0] - colon_lt_text[0].bbox[0]) > abs(
                        another_lt_text.bbox[0] - colon_lt_text[-1].bbox[0]):
                    new_bbox = [colon_lt_text[0].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[0].bbox[2], another_lt_text.bbox[3]]
                    another_row = [TextBox(text="@@:", bbox=new_bbox), another_lt_text]
                else:
                    new_bbox = [colon_lt_text[-1].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[-1].bbox[2], another_lt_text.bbox[3]]
                    # Add a placeholder column.
                    another_row = [another_lt_text, TextBox(text="@@:", bbox=new_bbox)]
                if show:
                    print('another_row', another_row)
                for lt_text3 in another_row:
                    another_used_lt_text_list.add(lt_text3)
                lt_text_row_list.append(another_row)
        else:
            lt_text_row_list.append(row)

        if show:
            print('get_contain_blank_row judge colon cost:', time.time()-time2)

    if show:
        print('get_contain_blank_row double loop cost: ', time.time()-time1)

    # Deduplicate: merge rows that share boxes.
    lt_text_row_list.sort(key=lambda x: len(x), reverse=True)
    if show:
        for lt_text_row in lt_text_row_list:
            print('before dedup lt_text_row', lt_text_row)

    lt_text_row_list = merge_intersecting_lists(lt_text_row_list)

    if show:
        for lt_text_row in lt_text_row_list:
            print('after dedup lt_text_row', lt_text_row)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])

    # Drop rows whose concatenated text is entirely whitespace.
    temp_list = []
    for lt_text_row in lt_text_row_list:
        row_text = ""
        for lt_text in lt_text_row:
            row_text += lt_text.get_text()
        if re.sub('\s+', '', row_text) == "":
            continue
        temp_list.append(lt_text_row)
    lt_text_row_list = temp_list
    return lt_text_row_list
+
+
def choose_center_blank(blank_row_list, blank_width, show=0):
    """Pick the vertical blank that splits the table into two halves.

    :param blank_row_list: per-row lists of blank bboxes
    :param blank_width: minimum width for the widest blank to qualify
    :param show: print debug output when truthy
    :return: bbox-like list for the central blank, or [] when the widest
        blank is too narrow or no per-row blank lines up with it
    """
    if not blank_row_list:
        return []

    # Flatten all per-row blanks and locate the widest one overall.
    all_blanks = [blank for row in blank_row_list for blank in row]
    if not all_blanks:
        return []
    widest = max(all_blanks, key=lambda b: abs(b[0] - b[2]))
    if show:
        print('max_blank', widest)
    if abs(widest[0] - widest[2]) <= blank_width:
        return []

    # From every row keep its widest blank, provided it overlaps the global
    # widest blank enough (line IoU >= 0.5).
    candidates = []
    for row in blank_row_list:
        if not row:
            continue
        row_widest = max(row, key=lambda b: abs(b[0] - b[2]))
        if show:
            print('max_blank_bbox, blank_row', row_widest, row)
        iou = line_iou(([widest[0], 0], [widest[2], 0]),
                       ([row_widest[0], 0], [row_widest[2], 0]))
        if iou >= 0.5:
            candidates.append(row_widest)
    if show:
        print('max_col', candidates)
    if not candidates:
        return []

    # The centre blank is the horizontal intersection of all candidates.
    return get_inter_part(candidates)
+
+
def set_head_value_in_col(col_list1, col_list2, show=0):
    """Split each cell text of a column into a [head, value] pair.

    A cell containing a colon is split at the colon. A colon-less cell that
    also appears in *col_list2* is treated as a header fragment and prefixed
    to the next header; any other colon-less cell continues the previous
    value (or becomes a value with an empty head).

    :param col_list1: cell texts of the column being processed
    :param col_list2: cell texts of the sibling column, used to recognise
        header fragments shared by both columns
    :param show: print debug output when truthy
    :return: list of [head, value] pairs
    """
    col_key_value_list = []
    last_key = ""
    for col1 in col_list1:
        match = re.search('[::]+', col1)
        if match:
            # Cell has a colon: text up to the colon is the head, the rest
            # is the value; a pending header fragment prefixes the head.
            key = col1[:match.end()]
            if last_key:
                key = last_key + key
                last_key = ""
            value = col1[match.end():]
            col_key_value_list.append([key, value])
        elif col1 in col_list2:
            # Same text appears in the sibling column: treat as a header
            # fragment to be joined with the next header.
            if show:
                print('col1 in col_list2')
            if last_key:
                # Two consecutive fragments: flush the earlier one as a
                # value-only row. (Removed a dead `last_key = ''` store that
                # was immediately overwritten below.)
                col_key_value_list.append(["", last_key])
            last_key = col1
        else:
            # No colon and not shared with the sibling column: continues the
            # previous value, or stands alone as a value with no head.
            if col_key_value_list and re.search('[::]', col_key_value_list[-1][1]):
                col_key_value_list[-1][1] += col1
            else:
                col_key_value_list.append(["", col1])

    # A trailing header fragment with nothing to attach to becomes its own
    # value-only row.
    if last_key:
        col_key_value_list.append(["", last_key])

    if show:
        print('col_key_value_list', col_key_value_list)

    return col_key_value_list
+
+
def divide_2_col_by_center_blank(b_table, center_blank_row, show=0):
    """Split each row of *b_table* into left/right text columns using the
    central blank band.

    Boxes whose horizontal centre lies left of the blank's centre go to the
    left column, the rest to the right; each side is concatenated in reading
    order. When either side lacks colons in at least a third of its rows,
    the layout is not a two-column key/value table and two empty lists are
    returned. (Removed an unused `col_box_dict` local and dead
    commented-out code.)

    :param b_table: rows of text boxes (objects with .bbox / .get_text());
        each row is sorted in place by left-x
    :param center_blank_row: bbox-like [x0, y0, x1, y1] of the central blank
    :param show: print debug output when truthy
    :return: (col_list1, col_list2) - left / right cell texts per row
    """
    col_list1 = []
    col_list2 = []
    # Loop-invariant centre x of the blank band.
    center_x = abs(center_blank_row[0] + center_blank_row[2]) / 2
    for lt_text_row in b_table:
        lt_text_row.sort(key=lambda x: x.bbox[0])

        left_col = []
        right_col = []
        for lt_text in lt_text_row:
            if (lt_text.bbox[2] + lt_text.bbox[0]) / 2 <= center_x:
                left_col.append(lt_text)
            else:
                right_col.append(lt_text)

        # Concatenate each side in reading order.
        left_col = sort_by_read_order(left_col)
        left_text = ''.join(x.get_text() for x in left_col)
        right_col = sort_by_read_order(right_col)
        right_text = ''.join(x.get_text() for x in right_col)

        col_list1.append(left_text.strip())
        col_list2.append(right_text.strip())

    if show:
        print('col_list1', col_list1)
        print('col_list2', col_list2)

    # Both columns must contain colons in enough rows; otherwise this is not
    # a two-column key/value table.
    colon_cnt1 = sum(1 for col in col_list1 if re.search('[::]', col))
    colon_cnt2 = sum(1 for col in col_list2 if re.search('[::]', col))

    if colon_cnt1 < len(col_list1) / 3 or colon_cnt2 < len(col_list2) / 3:
        col_list1 = []
        col_list2 = []
        if show:
            print('col_list1 colon_cnt1 less', colon_cnt1)
            print('col_list2 colon_cnt2 less', colon_cnt2)

    return col_list1, col_list2
+
+
def delete_blank_col(b_table_row_list):
    """Drop columns whose every cell is the empty string.

    :param b_table_row_list: list of rows, each a list of cell texts
    :return: new table with all-empty columns removed
    """
    # Gather the distinct values seen at each column index.
    cols_seen = {}
    for row in b_table_row_list:
        for idx, cell in enumerate(row):
            cols_seen.setdefault(idx, set()).add(cell)

    # A column is blank when its only value is the empty string.
    blank_cols = {idx for idx, values in cols_seen.items()
                  if len(values) == 1 and '' in values}

    return [
        [cell for idx, cell in enumerate(row) if idx not in blank_cols]
        for row in b_table_row_list
    ]
+
+
def fix_head_value_match(b_table, show=0):
    """Fix rows where heads and their values ended up stacked vertically.

    Detects a 4-column row holding only heads (cols 0/2 contain colons,
    cols 1/3 empty or the "@@:" placeholder) followed by rows holding only
    values (cols 0/2 empty/placeholder, cols 1/3 filled) and merges the
    stacked values back into the head row. Head rows are mutated in place.

    :param b_table: list of rows; only 4-column tables are processed
    :param show: print debug output when truthy
    :return: rebuilt table with value rows folded into their head rows
    """
    if not b_table:
        return b_table
    if len(b_table[0]) != 4:
        return b_table

    placeholder = ["", '@@:']
    head_index = None
    head_to_values = {}
    # Pair each head-only row with the value-only rows that follow it.
    for row_i, row in enumerate(b_table):
        if head_index is None:
            if row[1] in placeholder and row[3] in placeholder \
                    and re.search("[::]", row[0]) and re.search("[::]", row[2]):
                head_index = row_i
        elif row[0] in placeholder and row[2] in placeholder \
                and row[1] not in placeholder and row[3] not in placeholder:
            head_to_values.setdefault(head_index, []).append(row_i)
        else:
            head_index = None

    if show:
        print('match_head_value_dict', head_to_values)

    merged_rows = {}
    value_rows = set()
    # Fold the stacked values into their head row (mutated in place).
    for h_index, v_indexes in head_to_values.items():
        head_row = b_table[h_index]
        left_text = ""
        right_text = ""
        for v_index in v_indexes:
            value_rows.add(v_index)
            value_row = b_table[v_index]
            left_text += ''.join(value_row[:2])
            right_text += ''.join(value_row[2:])
        head_row[1] = left_text
        head_row[3] = right_text
        merged_rows[h_index] = head_row

    # Rebuild: merged heads replace originals, consumed value rows vanish.
    result = []
    for row_i, row in enumerate(b_table):
        if row_i in merged_rows:
            result.append(merged_rows[row_i])
        elif row_i not in value_rows:
            result.append(row)
    return result
+
+
+def add_last_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
+                  table_lt_text_row_list, show=0):
+    if not b_table:
+        return b_table
+    if len(b_table[0]) not in [4]:
+        return b_table
+
+    blank_h_list = []
+    max_h_list = []
+    for lt_text_row in table_lt_text_row_list:
+        if not lt_text_row:
+            continue
+        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
+        max_h_list.append(max_h)
+    max_h_list.sort(key=lambda x: x)
+    for i in range(1, len(max_h_list)):
+        blank_h_list.append(max_h_list[i] - max_h_list[i - 1])
+    mean_blank_h = np.mean(blank_h_list)
+    if show:
+        print('add_last_rows blank_width_list', blank_h_list)
+        print('add_last_rows mean_blank_h', mean_blank_h)
+
+    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
+    match_row_list = []
+    threshold = 5
+    add_blank_h = mean_blank_h + threshold
+    for li, lt_text_row in enumerate(lt_text_row_list):
+        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
+        if show:
+            print('max_h > table_bbox[3]', lt_text_row, max_h, table_bbox[3])
+        # 高度需要在表格y2和y2加上空白的距离间
+        if table_bbox[3] < max_h < table_bbox[3] + add_blank_h:
+            # lt_text x轴上穿过了中心bbox,则跳过
+            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
+                print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
+                continue
+
+            # 左边需在表格x1和中心x1之间
+            if table_bbox[0] - threshold <= min_w < center_blank_bbox[0]:
+                match_row_list.append([lt_text_row, 0, max_h])
+            # 右边需在表格x2和中心x2之间
+            elif center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3:
+                match_row_list.append([lt_text_row, 1, max_h])
+            else:
+                print('center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3')
+                break
+
+            add_blank_h = add_blank_h + mean_blank_h + threshold
+
+    if show:
+        print('add_last_rows match_row_list', match_row_list)
+
+    add_b_table = []
+    real_max_h = None
+    for mi, match_row in enumerate(match_row_list):
+        lt_text_row, is_right, max_h = match_row
+        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
+        # 只有一列
+        if len(lt_text_row) == 1:
+            text = lt_text_row[0].get_text()
+            match = re.search('[::]+', text)
+            real_max_h = max_h
+            if not match:
+                head = ""
+                value = text
+            else:
+                head = text[:match.end()]
+                value = text[match.end():]
+        # 或 两列,其实是表头由于空白被隔开
+        elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \
+                and lt_text_row[1].get_text()[-1] in [':', ":"]:
+            text = lt_text_row[0].get_text() + lt_text_row[1].get_text()
+            head = text
+            value = ''
+        # 两列
+        elif len(lt_text_row) == 2:
+            text1 = lt_text_row[0].get_text()
+            match = re.search('[::]+', text1)
+            if not match:
+                break
+            real_max_h = max_h
+            head = text1
+            value = lt_text_row[1].get_text()
+        else:
+            if show:
+                print('add_last_rows len(lt_text_row) break', len(lt_text_row))
+            break
+
+        # 获取上一行,可能需要将值补到上一行
+        if mi == 0 or len(add_b_table) == 0:
+            last_row = b_table[-1]
+            last_flag = 0
+        else:
+            last_row = add_b_table[-1]
+            last_flag = 1
+
+        if is_right:
+            if last_row[2] and not last_row[3] and not head and value:
+                b_table[-1][3] = value
+                current_row = ["", "", last_row[2], value]
+            else:
+                current_row = ["", "", head, value]
+        else:
+            if last_row[0] and not last_row[1] and not head and value:
+                current_row = [last_row[0], value, "", ""]
+            else:
+                current_row = [head, value, "", ""]
+
+        # if last_flag == 0:
+        #     b_table = b_table[:-1]
+        add_b_table.append(current_row)
+
+        if show:
+            print('current_row', current_row)
+
+    if show:
+        print('add_b_table', add_b_table)
+
+    b_table += add_b_table
+    if real_max_h is not None:
+        table_bbox[3] = real_max_h
+    return b_table
+
+
def add_first_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                   table_lt_text_row_list, show=0):
    """Fold stray text rows found just above a borderless table into its first row.

    Only 4-column tables are handled. Text rows that overlap the table's top
    edge and sit entirely left or right of the central blank strip are split
    at the first colon; a pure value (no head) is prepended to the matching
    value cell of the first table row. ``table_bbox[1]`` is then raised to
    cover the absorbed rows.

    :param b_table: table as a list of 4-cell rows; mutated in place
    :param table_bbox: [x1, y1, x2, y2] of the table; y1 may be mutated
    :param center_blank_bbox: bbox of the vertical blank strip between the
        left and right head/value column pairs
    :param lt_text_row_list: candidate text rows outside the table (each a
        list of objects with ``.bbox`` / ``.get_text()``); sorted in place
    :param table_lt_text_row_list: text rows already inside the table, used to
        estimate the average vertical gap between rows
    :param show: debug flag; print diagnostics when truthy
    :return: the (possibly updated) ``b_table``
    """
    if not b_table:
        return b_table
    if len(b_table[0]) not in [4]:
        return b_table

    # Estimate the mean vertical gap between consecutive table rows.
    blank_h_list = []
    max_h_list = []
    for lt_text_row in table_lt_text_row_list:
        if not lt_text_row:
            continue
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        max_h_list.append(max_h)
    max_h_list.sort(key=lambda x: x)
    for i in range(1, len(max_h_list)):
        blank_h_list.append(max_h_list[i] - max_h_list[i - 1])
    # NOTE(review): np.mean of an empty list yields nan — assumes the table
    # always contains at least two non-empty text rows; confirm upstream.
    mean_blank_h = np.mean(blank_h_list)
    if show:
        print('add_first_rows blank_width_list', blank_h_list)
        print('add_first_rows mean_blank_h', mean_blank_h)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
    match_row_list = []
    threshold = 5
    for lt_text_row in lt_text_row_list:
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('min_h < table_bbox[3]', lt_text_row, min_h, table_bbox[3])
        # The row must partially overlap the table's top edge.
        if min_h <= table_bbox[1] < max_h:
            # Skip rows that span across the central blank strip.
            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
                if show:  # fixed: debug print no longer leaks unconditionally
                    print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
                continue

            # Entirely left of the central strip -> left column pair.
            if min_w < center_blank_bbox[0]:
                match_row_list.append([lt_text_row, 0, min_h])
            # Reaches right of the central strip -> right column pair.
            elif center_blank_bbox[2] < max_w:
                match_row_list.append([lt_text_row, 1, min_h])
            else:
                break

    if show:
        print('add_first_rows match_row_list', match_row_list)

    real_min_h = None
    for match_row in match_row_list:
        lt_text_row, is_right, min_h = match_row
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
        # Single cell: split into head/value at the first colon run.
        if len(lt_text_row) == 1:
            text = lt_text_row[0].get_text()
            match = re.search('[::]+', text)
            real_min_h = min_h
            if not match:
                head = ""
                value = text
            else:
                head = text[:match.end()]
                value = text[match.end():]
        else:
            if show:
                print('add_first_rows len(lt_text_row) break', len(lt_text_row))
            break

        # A pure value (no head) is prepended to the matching cell of the
        # first table row.
        if not head and value:
            if is_right:
                b_table[0][3] = value + b_table[0][3]
            else:
                b_table[0][1] = value + b_table[0][1]

    # Extend the table upward to include the absorbed rows.
    if real_min_h is not None:
        table_bbox[1] = real_min_h
    return b_table
+
+
def get_row_bbox(row, mode='list'):
    """Return the union bounding box ``(min_x, min_y, max_x, max_y)`` of a row.

    :param row: list of ``[x1, y1, x2, y2]`` sequences (``mode='list'``) or
        objects exposing a ``.bbox`` attribute (``mode='.bbox'``)
    :param mode: ``'list'`` or ``'.bbox'``
    :return: tuple ``(min_x, min_y, max_x, max_y)``
    :raises ValueError: on an unknown mode (previously this crashed later
        with an UnboundLocalError)
    """
    if mode == 'list':
        x1_values = [x[0] for x in row]
        y1_values = [x[1] for x in row]
        x2_values = [x[2] for x in row]
        y2_values = [x[3] for x in row]
    elif mode == '.bbox':
        x1_values = [x.bbox[0] for x in row]
        y1_values = [x.bbox[1] for x in row]
        x2_values = [x.bbox[2] for x in row]
        y2_values = [x.bbox[3] for x in row]
    else:
        raise ValueError("mode must be 'list' or '.bbox', got %r" % (mode,))

    min_x = min(x1_values)
    max_x = max(x2_values)
    min_y = min(y1_values)
    max_y = max(y2_values)
    return min_x, min_y, max_x, max_y
+
+
def shrink_bbox(img, bbox_list):
    """Tighten each bbox to the smallest rectangle holding non-background pixels.

    The background colour is estimated as the most frequent colour of a
    down-sampled copy of the whole image; a pixel whose squared RGB distance
    from it exceeds ``20**2`` counts as content. Boxes whose crop is empty or
    entirely background are returned unchanged.

    :param img: HxWx3 image as a numpy array
    :param bbox_list: list of ``[x1, y1, x2, y2]``
    :return: list of shrunk bboxes, same order as the input
    """

    def _not_background_index(image_np, match_color):
        # Squared euclidean colour distance per pixel vs. the background.
        diff = np.sum((image_np - match_color) ** 2, axis=2)
        threshold = 20  # colour-distance threshold, squared below
        threshold = threshold ** 2
        diff_mask = diff > threshold
        # Indices of pixels that differ noticeably from the background.
        return np.where(diff_mask)

    # Down-sample before counting colours: much cheaper, same dominant colour.
    time0 = time.time()
    down_sample_factor = 8
    down_sampled_img = img[::down_sample_factor, ::down_sample_factor, :]
    down_sampled_img_color = down_sampled_img.reshape(-1, 3)
    colors, counts = np.unique(down_sampled_img_color, return_counts=True, axis=0)
    log('shrink_bbox 0 ' + str(time.time() - time0))

    # The most frequent colour is taken as the page background.
    time0 = time.time()
    max_count_index = np.argmax(counts)
    most_frequent_color = colors[max_count_index]
    most_frequent_color = most_frequent_color.astype(np.int32)
    log('shrink_bbox 1 ' + str(time.time() - time0))

    new_bbox_list = []
    img_int = img.astype(np.int32)
    time0 = time.time()
    for bbox in bbox_list:
        img_bbox_int = img_int[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]

        # Degenerate crop: keep the original bbox.
        if 0 in img_bbox_int.shape:
            new_bbox_list.append(bbox)
            continue

        # Row scan: first/last row containing content pixels.
        index_list = _not_background_index(img_bbox_int, most_frequent_color)
        if index_list[0].size == 0 or index_list[1].size == 0:
            new_bbox_list.append(bbox)
            continue
        min_h = index_list[0][0]
        max_h = index_list[0][-1]

        # Column scan via a transposed view (no copy of pixel data needed).
        img_bbox1 = np.swapaxes(img_bbox_int, 0, 1)
        index_list = _not_background_index(img_bbox1, most_frequent_color)
        if index_list[0].size == 0 or index_list[1].size == 0:
            new_bbox_list.append(bbox)
            continue
        min_w = index_list[0][0]
        max_w = index_list[0][-1]

        # Translate crop-local extremes back into image coordinates.
        real_min_w = bbox[0] + min_w
        real_max_w = bbox[0] + max_w
        real_min_h = bbox[1] + min_h
        real_max_h = bbox[1] + max_h
        new_bbox_list.append([real_min_w, real_min_h, real_max_w, real_max_h])
    log('shrink_bbox 2 ' + str(time.time() - time0))
    return new_bbox_list
+
+
def shrink_bbox_by_pixel(lt_text_list):
    """Vertically shrink each text box to the middle half of its height.

    Mutates each box's ``.bbox`` in place and returns the same list.
    """
    for box in lt_text_list:
        x1, y1, x2, y2 = box.bbox
        # A quarter of the height is trimmed from the top and from the bottom.
        quarter = abs(y2 - y1) / 2 / 2
        box.bbox = [x1, int(y1 + quarter), x2, int(y2 - quarter)]
    return lt_text_list
+
+
def get_inter_part(bbox_list, show=0):
    """Return the common (intersection-style) rectangle of a list of bboxes.

    Takes the maximum of all x1/y1 and the minimum of all x2/y2, then reorders
    the result so that min <= max on each axis (so disjoint boxes yield the
    gap between them rather than an inverted rectangle).

    NOTE: sorts ``bbox_list`` in place as a side effect.

    :param bbox_list: list of ``[x1, y1, x2, y2]``
    :param show: debug flag
    :return: ``[x1, y1, x2, y2]`` or ``None`` for an empty input
    """
    if not bbox_list:
        return None

    bbox_list.sort(key=lambda x: (x[0], x[2]))
    min_x, min_y, max_x, max_y = bbox_list[0]
    for bbox in bbox_list:
        if min_x < bbox[0]:
            min_x = bbox[0]
        if min_y < bbox[1]:
            min_y = bbox[1]
        if max_x > bbox[2]:
            max_x = bbox[2]
        if max_y > bbox[3]:
            max_y = bbox[3]
    # Reorder so each axis reads low-to-high even when there is no overlap.
    _min_x = min(min_x, max_x)
    _max_x = max(min_x, max_x)
    _min_y = min(min_y, max_y)
    _max_y = max(min_y, max_y)
    if show:
        print('get_inter_part', [_min_x, _min_y, _max_x, _max_y])
    return [_min_x, _min_y, _max_x, _max_y]
+
+
def get_inter_part_250530(bbox_list, show=0):
    """Unfinished 2025-05-30 rework of :func:`get_inter_part`.

    Currently only gathers and sorts coordinate lists and always returns
    ``None`` (``None`` immediately for an empty input). Kept for reference.
    """
    if not bbox_list:
        return None

    x1_list = sorted((box[0] for box in bbox_list), reverse=True)
    x2_list = sorted(box[2] for box in bbox_list)
    y1_list = [box[1] for box in bbox_list]
    y2_list = [box[3] for box in bbox_list]
+
+
def get_straight_lines_from_image(image_np, threshold=50):
    """Debug helper: detect straight lines (Canny + HoughLinesP) and display them.

    Opens OpenCV windows and blocks on ``waitKey`` — interactive debugging only.

    :param image_np: BGR image as a numpy array, or ``None``
    :param threshold: Hough accumulator threshold
    :return: ``False`` when the image is missing; otherwise ``None``
    """
    if image_np is None:
        print("无法读取图像")
        return False

    # Greyscale then Canny edge detection.
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 20, 150)

    cv2.imshow('edges', edges)

    # Probabilistic Hough transform over the edge map.
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold,
                            minLineLength=50, maxLineGap=2)

    # fixed: HoughLinesP returns None when no line is found — iterating None
    # raised a TypeError.
    if lines is None:
        lines = []

    for line in lines:
        line = line[0]
        print('line', line)
        cv2.line(image_np, line[:2], line[2:], (0, 0, 255))

    cv2.imshow('img', image_np)
    cv2.waitKey(0)

    print('lines', lines)
+
+
def get_table_bbox(table):
    """Return the union bbox ``[x1, y1, x2, y2]`` over all cells of a table.

    :param table: list of rows, each a list of objects with a ``.bbox``
    """
    cells = [cell for row in table for cell in row]
    left = min(cell.bbox[0] for cell in cells)
    top = min(cell.bbox[1] for cell in cells)
    right = max(cell.bbox[2] for cell in cells)
    bottom = max(cell.bbox[3] for cell in cells)
    return [left, top, right, bottom]
+
+
@memory_decorator
def merge_intersecting_lists(lists):
    """Single-pass union-merge of lists that share at least one element.

    Each incoming list is folded (as a set union, deduplicated) into the
    first previously-merged group it intersects; otherwise it starts a new
    group. Being a single pass, two groups that only become connected via a
    later list are not re-merged afterwards.
    """
    merged_lists = []
    for current_list in lists:
        current_set = set(current_list)
        for idx in range(len(merged_lists)):
            existing_set = set(merged_lists[idx])
            if current_set & existing_set:
                # Overlap found: replace the group with the deduplicated union.
                merged_lists[idx] = list(existing_set.union(current_set))
                break
        else:
            # No overlap with any existing group: start a new one.
            merged_lists.append(current_list.copy())
    return merged_lists
+
+
def merge_same_bbox(lt_text_list, avg_char_width, show=0):
    """Merge a short label fragment with the colon-bearing box to its right.

    When box i (at most 2 characters, no colon) lies on the same text line as
    a later box j that contains a colon, does not overlap it on the x axis,
    and is within 5 average character widths of it, both list slots are
    replaced by one combined TextBox; duplicates are removed at the end.

    :param lt_text_list: text boxes exposing ``.bbox`` and ``.get_text()``
    :param avg_char_width: average character width in pixels
    :param show: debug flag
    :return: deduplicated list sorted by (x1, y1)
    """
    from format_convert.convert_tree import TextBox
    for i in range(len(lt_text_list)):
        lt_text1 = lt_text_list[i]
        line1_x = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
        line1_y = ((lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0))

        for j in range(i+1, len(lt_text_list)):
            lt_text2 = lt_text_list[j]
            # if lt_text1 == lt_text2:
            #     continue
            # box2 must start strictly to the right of box1's right edge
            if lt_text1.bbox[2] >= lt_text2.bbox[0]:
                continue

            # require no x-axis overlap; skip when they intersect
            line2_x = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
            if line_iou(line1_x, line2_x) > 0:
                continue

            # same line (y-axis IoU > 0.9), small gap, right box has a colon,
            # left box has none and is a short fragment (<= 2 chars)
            line2_y = ((lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0))
            if line_iou(line1_y, line2_y) > 0.9 \
                    and abs(lt_text1.bbox[2] - lt_text2.bbox[0]) < avg_char_width * 5 \
                    and re.search('[::]', lt_text2.get_text()) \
                    and not re.search('[::]', lt_text1.get_text()) \
                    and len(lt_text1.get_text()) <= 2:
                new_lt_text = TextBox(text=lt_text1.get_text() + lt_text2.get_text(),
                                      bbox=[lt_text1.bbox[0], min(lt_text1.bbox[1], lt_text2.bbox[1]),
                                            lt_text2.bbox[2], max(lt_text1.bbox[3], lt_text2.bbox[3])
                                            ])
                # overwrite both slots with the merged box; set() below dedups
                lt_text_list[i] = new_lt_text
                lt_text_list[j] = new_lt_text
                if show:
                    print('new_lt_text', new_lt_text)

    lt_text_list = list(set(lt_text_list))
    lt_text_list.sort(key=lambda x: (x.bbox[0], x.bbox[1]))

    return lt_text_list
+
+
def sort_by_read_order(lt_text_list, threshold=10):
    """Sort text boxes into reading order: rows top-to-bottom, left-to-right.

    After a global sort on y1, a box whose y1 differs from the previous box's
    y1 by less than ``threshold`` joins the current row; each row is then
    ordered by x1.
    """
    if not lt_text_list:
        return lt_text_list

    lt_text_list.sort(key=lambda box: box.bbox[1])

    ordered = []
    row = [lt_text_list[0]]
    for prev, cur in zip(lt_text_list, lt_text_list[1:]):
        if abs(cur.bbox[1] - prev.bbox[1]) < threshold:
            # Close enough vertically: same visual line.
            row.append(cur)
        else:
            # Flush the finished row in left-to-right order.
            ordered.extend(sorted(row, key=lambda box: box.bbox[0]))
            row = [cur]

    # Flush the final row.
    ordered.extend(sorted(row, key=lambda box: box.bbox[0]))
    return ordered
+
+
def delete_empty_bbox(lt_text_list, show=0):
    """Drop boxes whose text is a lone colon/semicolon or only whitespace."""
    kept = []
    for box in lt_text_list:
        content = box.get_text()
        if content in [':', ":", ";", ";"]:
            continue
        if re.sub('\s', '', content) == "":
            continue
        kept.append(box)
    return kept
+
+
def standard_table(table, show=0):
    """Normalise a 4-column head/value table (cells are ``{'text': ...}`` dicts).

    Fixes applied in order: strip the ``@@:`` placeholder; move a head that
    OCR dropped into the value column back to column 0; pull head-only lines
    down onto the following value-only line; append dangling value-only lines
    to the previous complete row.
    """
    if not table:
        return table

    # Strip the '@@:' placeholder from every cell.
    for row in table:
        for cell in row:
            if '@@:' in cell.get('text'):
                cell['text'] = re.sub('@@:', '', cell.get('text'))

    # OCR sometimes misses the colon so a head lands in the value column:
    # ('', head, head2, '') -> (head, '', head2, '').
    for ri, row in enumerate(table):
        if row[0].get('text') == '' and row[1].get('text') != '' \
                and row[2].get('text') != '' and row[3].get('text') == '':
            row[0]['text'] = row[1].get('text')
            row[1]['text'] = ''
            if show:
                print('standard_table, add colon head', table[ri])

    # Heads on one line, values on the next: copy the heads down and drop the
    # head-only line.
    drop = set()
    for ri in range(1, len(table)):
        row, prev = table[ri], table[ri - 1]
        heads_only = prev[0].get('text') != '' and prev[1].get('text') == '' \
            and prev[2].get('text') != '' and prev[3].get('text') == ''
        values_only = row[0].get('text') == '' and row[1].get('text') != '' \
            and row[2].get('text') == '' and row[3].get('text') != ''
        if heads_only and values_only:
            row[0]['text'] = prev[0].get('text')
            row[2]['text'] = prev[2].get('text')
            drop.add(ri - 1)
            if show:
                print('standard_table, fix head value 1', table[ri])
    table = [row for ri, row in enumerate(table) if ri not in drop]

    # Continuation values on their own line: append them to the previous
    # complete row and drop the fragment.
    drop = set()
    for ri in range(1, len(table)):
        row, prev = table[ri], table[ri - 1]
        prev_full = prev[0].get('text') != '' and prev[1].get('text') != '' \
            and prev[2].get('text') != '' and prev[3].get('text') != ''
        values_only = row[0].get('text') == '' and row[1].get('text') != '' \
            and row[2].get('text') == '' and row[3].get('text') != ''
        if prev_full and values_only:
            prev[1]['text'] += row[1]['text']
            prev[3]['text'] += row[3]['text']
            drop.add(ri)
    table = [row for ri, row in enumerate(table) if ri not in drop]
    return table
+
+
@memory_decorator
def find_outline_lt_text(lt_text_list, show=0):
    """Return text boxes that sit alone on their text line.

    Boxes are greedily grouped into lines by y-axis interval overlap
    (``line_iou > 0``); any group containing a single box is considered to be
    outside the table layout and is returned.

    :param lt_text_list: text boxes exposing ``.bbox``; sorted in place
    :param show: debug flag
    :return: list of single-member-line boxes
    """
    lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
    used_lt_text_list = []
    row_list = []
    for lt_text1 in lt_text_list:
        if lt_text1 in used_lt_text_list:
            continue
        row = [lt_text1]
        used_lt_text_list.append(lt_text1)
        for lt_text2 in lt_text_list:
            if lt_text2 in used_lt_text_list:
                continue
            # vertical intervals of the two boxes, as degenerate segments
            line1 = [(lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0)]
            line2 = [(lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0)]
            if line_iou(line1, line2) > 0:
                row.append(lt_text2)
                used_lt_text_list.append(lt_text2)
        row_list.append(row)

    # keep only lines holding exactly one box
    outline_lt_text_list = []
    for row in row_list:
        if len(row) >= 2:
            continue
        outline_lt_text_list += row

    if show:
        print('outline_lt_text_list', outline_lt_text_list)
    return outline_lt_text_list
+
+
def get_iou(bbox1, bbox2):
    """Pixel-style IoU of two boxes; full containment short-circuits to 1.0.

    Widths/heights use the inclusive ``+1`` convention, so boxes that merely
    touch still produce a positive intersection.

    :param bbox1: ``[x1, y1, x2, y2]``
    :param bbox2: ``[x1, y1, x2, y2]``
    :return: IoU in [0, 1]
    """
    ax1, ay1, ax2, ay2 = bbox1
    bx1, by1, bx2, by2 = bbox2

    # Either box fully containing the other counts as a perfect match.
    a_holds_b = ax1 <= bx1 and ay1 <= by1 and ax2 >= bx2 and ay2 >= by2
    b_holds_a = bx1 <= ax1 and by1 <= ay1 and bx2 >= ax2 and by2 >= ay2
    if a_holds_b or b_holds_a:
        return 1.0

    # Intersection rectangle (inclusive pixel widths).
    inter_w = max(0, min(ax2, bx2) - max(ax1, bx1) + 1)
    inter_h = max(0, min(ay2, by2) - max(ay1, by1) + 1)
    inter_area = inter_w * inter_h

    area_a = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    area_b = (bx2 - bx1 + 1) * (by2 - by1 + 1)
    union_area = area_a + area_b - inter_area

    return inter_area / union_area if union_area != 0 else 0
+
+
def fix_cross_bbox(lt_text_list, show=0):
    """Pull apart pairs of overlapping text boxes by trimming them in place.

    For every intersecting pair, the boxes are separated along an axis only
    when the intersection on that axis is small (a large intersection means
    the real overlap is on the other axis). Each box's ``.bbox`` is mutated.

    :param lt_text_list: text boxes exposing a mutable ``.bbox``
    :param show: debug flag
    :return: the same list, boxes adjusted
    """
    for lt_text1 in lt_text_list:
        for lt_text2 in lt_text_list:
            if lt_text1 == lt_text2:
                continue
            if get_iou(lt_text1.bbox, lt_text2.bbox) > 0:
                if show:
                    print('fix_cross_bbox1', lt_text1, lt_text2)
                x10, x11, x12, x13 = lt_text1.bbox
                x20, x21, x22, x23 = lt_text2.bbox

                # overlap at box1's right edge; the overlap must be small,
                # otherwise the pair really intersects on the other axis
                if x10 < x20 < x12 and x12 - x20 < max(abs(x12 - x10), abs(x20 - x22)) / 2:
                    x12 = min(lt_text1.bbox[2], lt_text2.bbox[0])
                    x20 = max(lt_text1.bbox[2], lt_text2.bbox[0])

                # overlap at box1's bottom edge; same small-overlap rule
                if x11 < x21 < x13 and x13 - x21 < max(abs(x13 - x11), abs(x21 - x23)) / 2:
                    x13 = min(lt_text1.bbox[3], lt_text2.bbox[1])
                    x21 = max(lt_text1.bbox[3], lt_text2.bbox[1])

                lt_text1.bbox = [x10, x11, x12, x13]
                lt_text2.bbox = [x20, x21, x22, x23]
                if show:
                    print('fix_cross_bbox2', lt_text1, lt_text2)
    return lt_text_list
+
+
def split_lt_text_by_many_space(lt_text_list, show=0):
    """Adjust/split text boxes based on space runs, estimating pixel widths.

    Three passes over ``lt_text_list``:
    1. strip leading/trailing (half- and full-width) spaces, narrowing the
       bbox by the spaces' estimated pixel width;
    2. re-join heads whose characters are separated by spaces,
       e.g. ``电  话: ... 电  话:``;
    3. split a box containing one long internal space run into two boxes.

    Pixel widths are estimated from bbox width divided by the text's terminal
    display width (``get_char_unicode_length``). New boxes are TextBox
    instances; affected originals are removed from the list.
    """
    from format_convert.convert_tree import TextBox

    # Pass 1: strip leading/trailing spaces and shrink the bbox accordingly.
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox

        if len(text) == 0:
            continue
        text_unicode_len = get_char_unicode_length(text)
        if text_unicode_len == 0:
            continue
        # pixels per display-width unit
        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        space1 = re.findall('^[  ]+', text)
        if space1:
            space1 = ''.join(space1)
            space1_unicode_len = get_char_unicode_length(space1)
            space1_pixel_len = space1_unicode_len * ratio
            text = re.sub('^[  ]+', '', text)
            bbox = [bbox[0] + space1_pixel_len, bbox[1], bbox[2], bbox[3]]
            if len(text) == 0:
                continue
            text_unicode_len = get_char_unicode_length(text)
            if text_unicode_len == 0:
                continue
            ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        space2 = re.findall('[  ]+$', text)
        if space2:
            space2 = ''.join(space2)
            space2_unicode_len = get_char_unicode_length(space2)
            space2_pixel_len = space2_unicode_len * ratio
            text = re.sub('[  ]+$', '', text)
            bbox = [bbox[0], bbox[1], bbox[2] - space2_pixel_len, bbox[3]]
            if len(text) == 0:
                continue
            text_unicode_len = get_char_unicode_length(text)
            if text_unicode_len == 0:
                continue
            ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        if space1 or space2:
            new_lt_text = TextBox(text=text, bbox=bbox)
            add_lt_text_list.append(new_lt_text)
            delete_lt_text_list.append(lt_text)

    for lt_text in delete_lt_text_list:
        if lt_text in lt_text_list:
            lt_text_list.remove(lt_text)
    lt_text_list += add_lt_text_list

    # Pass 2: collapse spaces inside spaced-out heads such as 电  话: ... 电  话:
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox

        if len(text) == 0:
            continue

        space_list = re.findall('[  ]+', text)
        if len(space_list) >= 2:
            # split on the longest space run; each side must look like a
            # two-character head ending in a colon
            space_list.sort(key=lambda x: len(x))
            max_space = space_list[-1]
            match = re.search(max_space, text)
            if show:
                print('max_space', max_space)
                print('space_list', space_list)
            if match:
                part1 = text[:match.start()]
                part2 = text[match.end():]
                ss1 = re.split('[  ]+', part1)
                ss2 = re.split('[  ]+', part2)

                if len(ss1) == 2 and len(ss1[0]) == 1 and len(ss1[1]) == 2 and ss1[1][-1] in [':', ':'] \
                        and len(ss2) == 2 and len(ss2[0]) == 1 and len(ss2[1]) == 2 and ss2[1][-1] in [':', ':']:
                    new_text = ''.join(ss1) + max_space + ''.join(ss2)
                    new_lt_text = TextBox(text=new_text, bbox=bbox)
                    add_lt_text_list.append(new_lt_text)
                    delete_lt_text_list.append(lt_text)

    if show:
        print('split_lt_text_by_many_space add_lt_text_list222', add_lt_text_list)
        print('split_lt_text_by_many_space delete_lt_text_list222', delete_lt_text_list)

    for lt_text in delete_lt_text_list:
        if lt_text in lt_text_list:
            lt_text_list.remove(lt_text)
    lt_text_list += add_lt_text_list

    # Pass 3: split a box with one long internal space run into two boxes.
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox

        if len(text) == 0:
            continue

        text_unicode_len = get_char_unicode_length(text)
        if text_unicode_len == 0:
            continue
        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        # a run of >= 4 spaces dividing the text into exactly two parts
        match = re.search('[  ]{4,}', text)
        ss = re.split('[  ]+', text)
        if match and len(ss) == 2:
            # if match:
            part1 = text[:match.start()]
            part2 = text[match.end():]

            l1 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part1)
            l2 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part2)
            # both sides must carry enough real characters
            if len(l1) >= 2 and len(l2) >= 2:
                part1_unicode_len = get_char_unicode_length(part1)
                part2_unicode_len = get_char_unicode_length(part2)

                part1_pixel_len = ratio * part1_unicode_len
                part2_pixel_len = ratio * part2_unicode_len

                # avg_char_w = abs(bbox[0] - bbox[2]) / len(text)
                bbox1 = [bbox[0], bbox[1], bbox[0] + part1_pixel_len, bbox[3]]
                bbox2 = [bbox[2] - part2_pixel_len, bbox[1], bbox[2], bbox[3]]
                # create new TextBox objects for the two halves
                new_lt_text1 = TextBox(text=part1, bbox=bbox1)
                new_lt_text2 = TextBox(text=part2, bbox=bbox2)
                add_lt_text_list += [new_lt_text1, new_lt_text2]
                delete_lt_text_list.append(lt_text)

    for lt_text in delete_lt_text_list:
        if lt_text in lt_text_list:
            lt_text_list.remove(lt_text)
    lt_text_list += add_lt_text_list

    if show:
        print('split_lt_text_by_many_space add_lt_text_list333', add_lt_text_list)
        print('split_lt_text_by_many_space delete_lt_text_list333', delete_lt_text_list)

    return lt_text_list
+
+
def get_char_unicode_length(text, show=0):
    """Return the terminal display width of ``text`` via ``wcwidth.wcswidth``.

    Full-width (e.g. CJK) characters count as 2 cells, ASCII as 1. Per the
    wcwidth documentation, strings containing non-printable characters yield
    -1 — callers here guard against a result of 0 but presumably assume no
    control characters appear; TODO confirm.

    :param text: string to measure
    :param show: debug flag
    :return: display width in cells (int)
    """
    width = wcwidth.wcswidth(text)
    if show:
        print('text unicode_length', text, width)
    return width
+
+
def fix_final_row(table, show=0):
    """Fold a trailing single-value row into the row above it.

    When the last row's only non-placeholder cell is column 3 (right value)
    or column 1 (left value), that cell overwrites the corresponding cell of
    the second-to-last row and the trailing row is dropped. ``'@@:'`` is
    treated like an empty cell.

    :param table: list of 4-cell rows of plain strings; mutated in place
    :param show: debug flag
    :return: the possibly shortened table
    """
    if len(table) < 2:
        return table
    last_row = table[-2]
    final_row = table[-1]
    if show:  # fixed: debug prints were previously unconditional
        print('final_row', final_row)
        print('last_row', last_row)
    delete_final_flag = 0
    # Lone value in the right column pair.
    if final_row[0] in ['', '@@:'] and final_row[1] in ['', '@@:'] \
            and final_row[2] in ['', '@@:'] and final_row[3] not in ['', '@@:']:
        table[-2][3] = final_row[3]
        delete_final_flag = 1
        if show:
            print('fix_final_row right', table[-2])

    # Lone value in the left column pair.
    if final_row[0] in ['', '@@:'] and final_row[1] not in ['', '@@:'] \
            and final_row[2] in ['', '@@:'] and final_row[3] in ['', '@@:']:
        table[-2][1] = final_row[1]
        delete_final_flag = 1
        if show:
            print('fix_final_row left', table[-2])

    if delete_final_flag:
        table = table[:-1]

    return table
+
+
if __name__ == '__main__':
    # Ad-hoc debugging entry point. The commented block below previously ran
    # the OCR + borderless-table pre-check over a folder of saved screenshots.
    # from format_convert.convert_pdf import PDFConvert
    # pdf_c = PDFConvert(None, None, None)
    # from format_convert.convert_image import ImageProcess
    # img_p = ImageProcess(None, None)
    #
    # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_not_detect\*')
    # image_np_list = [[x, cv2.imread(x)] for x in ps]
    # for p, image_np in image_np_list:
    #     # cap overall resolution
    #     image_np = img_p.resize_process(image_np)
    #     # run OCR
    #     text_list, box_list = img_p.ocr_process(image_np)
    #     # convert to lt_text_box objects
    #     _lt_text_list = text_bbox_to_lt(text_list, box_list)
    # pre-check bboxes for a possible borderless table
    # _flag = judge_has_b_table_by_bbox(_lt_text_list, [], 0)
    # print('path', p, 'has b table', _flag)

    # Current debug run: load one saved screenshot (hard-coded local path),
    # resize it and visualise Hough line detection.
    _pp = r'D:\Project\format_conversion_maxcompute\save_b_table\15-8292f767be81f404b813c119058a8a75.png'
    img111 = cv2.imread(_pp)
    img111 = pil_resize(img111, 1024, 768)
    get_straight_lines_from_image(img111)
    pass

+ 5 - 0
botr/utils.py

@@ -38,6 +38,11 @@ def request_post(url, param, time_out=1000, use_zlib=False):
 
 
 def line_iou(line1, line2, axis=0):
+    if line1[0][axis] <= line2[0][axis] <= line2[1][axis] <= line1[1][axis]:
+        return 1.
+    if line2[0][axis] <= line1[0][axis] <= line1[1][axis] <= line2[1][axis]:
+        return 1.
+
     inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
     # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
     union = min(abs(line1[0][axis]-line1[1][axis]), abs(line2[0][axis]-line2[1][axis]))

+ 1 - 1
config/interface_new.yml

@@ -58,7 +58,7 @@
 
     "tika": {
       "port": [ 16020 ],
-      "port_num": [ 2 ],
+      "port_num": [ 1 ],
       "gpu": [ -1 ]
     }
   },

+ 287 - 118
format_convert/convert.py

@@ -1,4 +1,4 @@
-#-*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 import gc
 import json
 import sys
@@ -6,8 +6,20 @@ import os
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 # 强制tf使用cpu
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+# 动态添加 VERSION 属性到 Image 类
+import PIL
+from PIL import Image
+Image.VERSION = PIL.__version__
+
 from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
-    set_flask_global, get_md5_from_bytes, memory_decorator
+    set_flask_global, get_md5_from_bytes, memory_decorator, register_all_fonts
+
+# 调用函数注册字体
+# register_all_fonts("/usr/share/fonts/opentype/noto/")
+# register_all_fonts("/usr/share/fonts/truetype/arphic")
+# register_all_fonts("/usr/share/fonts/")
+
 from format_convert.convert_doc import doc2text, DocConvert
 from format_convert.convert_docx import docx2text, DocxConvert
 from format_convert.convert_image import picture2text, ImageConvert
@@ -18,6 +30,8 @@ from format_convert.convert_txt import txt2text, TxtConvert
 from format_convert.convert_xls import xls2text, XlsConvert
 from format_convert.convert_xlsx import xlsx2text, XlsxConvert
 from format_convert.convert_zip import zip2text, ZipConvert
+from format_convert.convert_wps import WpsConvert
+from format_convert.convert_ofd import OfdConvert
 from format_convert.convert_need_interface import from_atc_interface
 
 import hashlib
@@ -33,12 +47,28 @@ import logging
 from bs4 import BeautifulSoup
 from flask import Flask, request, g
 import inspect
+
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 from format_convert.table_correct import *
 from format_convert.wrapt_timeout_decorator import *
 from format_convert import _global
 from config.max_compute_config import MAX_COMPUTE
 
+support_file_types = [
+    'txt',
+    'pdf',
+    'doc',
+    'docx',
+    'xls',
+    'xlsx',
+    'zip',
+    'rar',
+    'jpg',
+    'png',
+    'jpeg',
+    'swf',
+    'wps',
+]
 
 if get_platform() == "Windows":
     globals().update({"time_out": 1000})
@@ -64,6 +94,9 @@ def getText(_type, path_or_stream, _page_no=None, time_out=300):
     except:
         unique_type_dir = path_or_stream + "_" + _type + os.sep
 
+    if not os.path.exists(unique_type_dir):
+        os.mkdir(unique_type_dir)
+
     if _type == "pdf":
         if MAX_COMPUTE:
             return PDFConvert(path_or_stream, unique_type_dir, _page_no).get_html()
@@ -102,11 +135,19 @@ def getText(_type, path_or_stream, _page_no=None, time_out=300):
         if MAX_COMPUTE:
             return TxtConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
+    if _type == "wps":
+        if MAX_COMPUTE:
+            return WpsConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(WpsConvert(path_or_stream, unique_type_dir))
+    if _type == "ofd":
+        if MAX_COMPUTE:
+            return OfdConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(OfdConvert(path_or_stream, unique_type_dir))
     return [""]
 
 
 def to_html(path, text):
-    with open(path, 'w',encoding="utf8") as f:
+    with open(path, 'w', encoding="utf8") as f:
         f.write("<!DOCTYPE HTML>")
         f.write('<head><meta charset="UTF-8"></head>')
         f.write("<body>")
@@ -154,6 +195,11 @@ def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300, save_m
     if get_platform() == "Windows":
         _global._init()
 
+    if MAX_COMPUTE:
+        _path = "/home/admin"
+    else:
+        _path = os.path.dirname(os.path.abspath(__file__))
+
     globals().update({"md5": _md5})
     _global.update({"md5": _md5})
     log("into unique_temp_file_process")
@@ -247,7 +293,7 @@ def cut_str(text_list, only_text_list, max_bytes_length=2000000):
             return only_text_list
 
         # 截取字符
-        all_text = all_text[:int(max_bytes_length/3)]
+        all_text = all_text[:int(max_bytes_length / 3)]
         return [all_text]
     except Exception as e:
         log("cut_str " + str(e))
@@ -336,7 +382,7 @@ def convert_maxcompute(data, ocr_model, otr_model):
             print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
         else:
             print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
-                  "is_success": 1}, time.time() - start_time)
+                                        "is_success": 1}, time.time() - start_time)
         return {"result_html": text, "result_text": only_text, "is_success": 1}
     except Exception as e:
         print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
@@ -350,6 +396,20 @@ app = Flask(__name__)
 
 @app.route('/convert', methods=['POST'])
 def _convert():
+    try:
+        data = request.form
+    except Exception:
+        log_convert_result("1" + "0" * 15, [-1], "", 0,
+                           None, None, time.time())
+        traceback.print_exc()
+        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
+                           "is_success": 0, "swf_images": str([]),
+                           "classification": ""})
+    result = convert(data)
+    return result
+
+
+def _convert_old_250613():
     """
     接口返回值:
     {[str], 1}: 处理成功
@@ -377,11 +437,11 @@ def _convert():
     # snapshot = tracemalloc.take_snapshot()
 
     _global._init()
-    _global.update({"md5": "1"+"0"*15})
+    _global.update({"md5": "1" + "0" * 15})
     set_flask_global()
     # _global.update({"port": str(port)})
 
-    log("into convert")
+    log("into _convert")
     start_time = time.time()
     _md5 = _global.get("md5")
     _type = None
@@ -395,12 +455,12 @@ def _convert():
         file_path = data.get("file_path")
         if file_path is None:
             stream = base64.b64decode(data.get("file"))
-            log("get bytes from file " + str(time.time()-_time))
+            log("get bytes from file " + str(time.time() - _time))
         # 有路径则直接取路径打开文件
         else:
             with open(file_path, "rb") as f:
                 stream = f.read()
-            log("get bytes from file_path " + str(time.time()-_time))
+            log("get bytes from file_path " + str(time.time() - _time))
         _type = data.get("type")
         _md5 = get_md5_from_bytes(stream)
         _md5 = _md5[0]
@@ -427,7 +487,8 @@ def _convert():
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'), save_middle=save_middle)
             except TimeoutError:
                 log("convert time out! 300 sec")
                 text = [-5]
@@ -435,7 +496,8 @@ def _convert():
         else:
             # Linux 通过装饰器设置整个转换超时时间
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'), save_middle=save_middle)
             except TimeoutError:
                 log("convert time out! 300 sec")
                 text = [-5]
@@ -447,11 +509,12 @@ def _convert():
                 is_success = 1
             else:
                 is_success = 0
-            log("md5: " + str(_md5)
-                         + " finished result: " + str(text)
-                         + " is_success: " + str(is_success) + " "
-                         + str(_type) + " "
-                         + " " + str(time.time() - start_time))
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(text) + " "
+                + "is_success: " + str(is_success) + " "
+                + str(_type) + " "
+                + 'None '
+                + str(round(time.time() - start_time, 2)))
             return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                                "is_success": is_success, "swf_images": str(swf_images)})
 
@@ -484,16 +547,17 @@ def _convert():
         if only_text[0] == '' and len(only_text) <= 1:
             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             log("md5: " + str(_md5) + " "
-                + " finished result: ['', 0] is_success: 1 "
+                + "finished result: ['', 0] is_success: 1 "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + 'None '
+                + str(round(time.time() - start_time, 2)))
         else:
-            log("md5: " + str(_md5) +
-                " finished result: " + str(only_text)[:20] + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(only_text)[:20] + " "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(_type) + " "
                 + str(classification) + " "
-                + str(time.time() - start_time))
+                + str(round(time.time() - start_time, 2)))
 
         # log("growth end" + str(objgraph.growth()))
         # log("most_common_types end" + str(objgraph.most_common_types(20)))
@@ -502,15 +566,24 @@ def _convert():
                            "classification": classification})
 
     except ConnectionError:
-        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
-            + str(time.time() - start_time))
+        # log("convert post has no data!" + " failed result: [-2] is_success: 0 "
+        #     + str(round(time.time() - start_time, 2)))
+        log("md5: " + str(_md5) + " "
+            + "failed result: [-2] is_success: 0 "
+            + str(_type) + " "
+            + "None "
+            + str(round(time.time() - start_time, 2))
+            )
         return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                            "is_success": 0, "swf_images": str([]),
                            "classification": ""})
     except Exception as e:
-        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
-            + str(_type) + " " +
-            str(time.time() - start_time))
+        log("md5: " + str(_md5) + " "
+            + "failed result: [-1] is_success: 0 "
+            + str(_type) + " "
+            + "None "
+            + str(round(time.time() - start_time, 2))
+            )
         traceback.print_exc()
         return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                            "is_success": 0, "swf_images": str([]),
@@ -545,6 +618,146 @@ def _convert():
 
 
 def convert(data):
+    """
+    接口返回值:
+    :return: {"result_html": [str], "result_text": [str],
+              "is_success": int, "swf_images": str(list)}
+    """
+    log("into convert")
+    start_time = time.time()
+
+    # 初始化
+    _global._init()
+    _global.update({"md5": "1" + "0" * 15})
+    set_flask_global()
+    # 文件md5
+    _md5 = _global.get("md5")
+    # 文件类型
+    _type = None
+    try:
+        if not data:
+            log("convert no data!")
+            raise ConnectionError
+
+        file_path = data.get("file_path")
+        if file_path is None:
+            stream = base64.b64decode(data.get("file"))
+            log("get bytes from file " + str(time.time() - start_time))
+        # 有路径则直接取路径打开文件
+        else:
+            with open(file_path, "rb") as f:
+                stream = f.read()
+            log("get bytes from file_path " + str(time.time() - start_time))
+
+        # 获取真实值
+        _type = data.get("type")
+        _md5 = get_md5_from_bytes(stream)
+        _md5 = _md5[0]
+        _global.update({"md5": _md5})
+
+        # 指定页码范围
+        _page_no = data.get('page_no')
+
+        # 指定timeout
+        _timeout = data.get('timeout')
+        if _timeout is not None:
+            globals().update({"time_out": _timeout})
+
+        # 是否保留中间文件
+        save_middle = data.get('save_middle')
+
+        # 最终结果截取的最大字节数
+        max_bytes = data.get("max_bytes")
+
+        # 开始转换,并且控制时间
+        try:
+            text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                        time_out=globals().get('time_out'), save_middle=save_middle)
+        except TimeoutError:
+            log("convert time out! 300 sec")
+            text = [-5]
+            swf_images = []
+
+        # 报错依然成功的
+        still_success_code = [-3, -4, -7]
+        if judge_error_code(text):
+            if judge_error_code(text, still_success_code):
+                is_success = 1
+            else:
+                is_success = 0
+            log_convert_result(_md5, text, "", is_success,
+                               _type, None, start_time)
+            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
+                               "is_success": is_success, "swf_images": str(swf_images)})
+
+        # 结果保存result.html
+        text_str = ""
+        for t in text:
+            text_str += t
+        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
+
+        # 取纯文本
+        only_text = []
+        for t in text:
+            new_t = BeautifulSoup(t, "lxml").get_text()
+            new_t = re.sub("\n", "", new_t)
+            only_text.append(new_t)
+
+        # 判断附件类型
+        classification = from_atc_interface(' '.join(only_text))
+        if judge_error_code(classification):
+            classification = [str(classification[0])]
+
+        # 判断长度,过长截取
+        text = cut_str(text, only_text, max_bytes)
+        only_text = cut_str(only_text, only_text)
+
+        if len(only_text) == 0:
+            only_text = [""]
+
+        if only_text[0] == '' and len(only_text) <= 1:
+            log_convert_result(_md5, '', '', 1,
+                               _type, None, start_time)
+        else:
+            log_convert_result(_md5, only_text, text, 1,
+                               _type, classification, start_time)
+        return json.dumps({"result_html": text, "result_text": only_text,
+                           "is_success": 1, "swf_images": str(swf_images),
+                           "classification": classification})
+
+    except ConnectionError:
+        log_convert_result(_md5, [-2], "", 0,
+                           _type, None, start_time)
+        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
+                           "is_success": 0, "swf_images": str([]),
+                           "classification": ""})
+    except Exception:
+        log_convert_result(_md5, [-1], "", 0,
+                           _type, None, start_time)
+        traceback.print_exc()
+        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
+                           "is_success": 0, "swf_images": str([]),
+                           "classification": ""})
+    finally:
+        pass
+        # log("finally")
+
+
def log_convert_result(_md5, only_text, text, is_success, _type, _attach_class, start_time):
    """Emit one space-separated summary log line for a finished conversion.

    Fields: md5, a 20-char spaceless preview of the extracted text, the length
    of the html result, the success flag, file type, attachment class, and the
    elapsed time in seconds (used by monitoring to track average duration).
    """
    preview = re.sub(' ', '', str(only_text)[:20])
    elapsed = round(time.time() - start_time, 3)
    parts = (
        "md5: " + str(_md5),
        "finished result: " + preview,
        str(len(str(text))),
        "is_success: " + str(is_success),
        str(_type),
        str(_attach_class),
        str(elapsed),
    )
    log(' '.join(parts))
+
+
+def convert_old_250613(data):
     """
     接口返回值:
     {[str], 1}: 处理成功
@@ -558,7 +771,7 @@ def convert(data):
     :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
     """
     _global._init()
-    _global.update({"md5": "1"+"0"*15})
+    _global.update({"md5": "1" + "0" * 15})
     set_flask_global()
 
     log("into convert")
@@ -584,7 +797,8 @@ def convert(data):
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'))
             except TimeoutError:
                 log("convert time out! 300 sec")
                 text = [-5]
@@ -592,7 +806,8 @@ def convert(data):
         else:
             # Linux 通过装饰器设置整个转换超时时间
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'))
             except TimeoutError:
                 log("convert time out! 300 sec")
                 text = [-5]
@@ -604,11 +819,12 @@ def convert(data):
                 is_success = 1
             else:
                 is_success = 0
-            log("md5: " + str(_md5)
-                + " finished result: " + str(text)
-                + " is_success: " + str(is_success) + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(text) + " "
+                + "is_success: " + str(is_success) + " "
                 + str(_type) + " "
-                + " " + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                     "is_success": is_success, "swf_images": str(swf_images)}
 
@@ -639,18 +855,19 @@ def convert(data):
             only_text = [""]
 
         if only_text[0] == '' and len(only_text) <= 1:
-            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
+            # print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             log("md5: " + str(_md5) + " "
-                + " finished result: ['', 0] is_success: 1 "
+                + "finished result: ['', 0] is_success: 1 "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
         else:
-            log("md5: " + str(_md5) +
-                " finished result: " + str(only_text)[:20] + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(only_text)[:20] + " "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(_type) + " "
                 + str(classification) + " "
-                + str(time.time() - start_time))
+                + str(round(time.time() - start_time, 2)))
 
         return {"result_html": text, "result_text": only_text,
                 "is_success": 1, "swf_images": str(swf_images),
@@ -658,7 +875,7 @@ def convert(data):
 
     except ConnectionError:
         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
-            + str(time.time() - start_time))
+            + str(round(time.time() - start_time, 2)))
         return {"result_html": ["-2"], "result_text": ["-2"],
                 "is_success": 0, "swf_images": str([]),
                 "classification": ""}
@@ -689,7 +906,7 @@ def convert_old(data, ocr_model, otr_model):
     """
     log("into convert")
     _global._init()
-    _global.update({"md5": "1"+"0"*15})
+    _global.update({"md5": "1" + "0" * 15})
     # set_flask_global()
 
     start_time = time.time()
@@ -706,7 +923,7 @@ def convert_old(data, ocr_model, otr_model):
         _md5 = get_md5_from_bytes(stream)
         _md5 = _md5[0]
         _global.update({"md5": _md5})
-        log("get bytes from file " + str(time.time()-_time))
+        log("get bytes from file " + str(time.time() - _time))
 
         if get_platform() == "Windows":
             try:
@@ -730,11 +947,12 @@ def convert_old(data, ocr_model, otr_model):
                 is_success = 1
             else:
                 is_success = 0
-            log("md5: " + str(_md5)
-                + " finished result: " + str(text)
-                + " is_success: " + str(is_success) + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(text) + " "
+                + "is_success: " + str(is_success) + " "
                 + str(_type) + " "
-                + " " + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                     "is_success": is_success, "swf_images": str(swf_images)}
 
@@ -761,22 +979,24 @@ def convert_old(data, ocr_model, otr_model):
         if only_text[0] == '' and len(only_text) <= 1:
             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             log("md5: " + str(_md5) + " "
-                + " finished result: ['', 0] is_success: 1 "
+                + "finished result: ['', 0] is_success: 1 "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
         else:
-            log("md5: " + str(_md5) +
-                " finished result: " + str(only_text)[:20] + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(only_text)[:20] + " "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
 
         return {"result_html": text, "result_text": only_text,
                 "is_success": 1, "swf_images": str(swf_images)}
 
     except ConnectionError:
         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
-            + str(time.time() - start_time))
+            + str(round(time.time() - start_time, 2)))
         return {"result_html": ["-2"], "result_text": ["-2"],
                 "is_success": 0, "swf_images": str([])}
     except Exception as e:
@@ -801,9 +1021,9 @@ def test_more(_dir, process_no=None):
     for p in file_path_list:
         if i % 10 == 0:
             if process_no is not None:
-                print("Process", process_no, i, time.time()-start_time)
+                print("Process", process_no, i, time.time() - start_time)
             else:
-                print("Loop", i, time.time()-start_time)
+                print("Loop", i, time.time() - start_time)
         test_one(p, from_remote=True)
         i += 1
 
@@ -847,79 +1067,28 @@ def test_duplicate(path_list, process_no=None):
     for i in range(500):
         if i % 10 == 0:
             if process_no is not None:
-                print("Process", process_no, i*len(path_list), time.time()-start_time)
+                print("Process", process_no, i * len(path_list), time.time() - start_time)
             else:
-                print("Loop", i*len(path_list), time.time()-start_time)
+                print("Loop", i * len(path_list), time.time() - start_time)
         for p in path_list:
             test_one(p, from_remote=True)
 
 
-global_type = ""
-local_url = "http://127.0.0.1"
-if get_platform() == "Windows":
-    _path = os.path.abspath(os.path.dirname(__file__))
-else:
-    _path = "/home/admin"
-    if not os.path.exists(_path):
-        _path = os.path.dirname(os.path.abspath(__file__))
+# global_type = ""
+# local_url = "http://127.0.0.1"
+# if get_platform() == "Windows":
+#     _path = os.path.abspath(os.path.dirname(__file__))
+# else:
+#     _path = "/home/admin"
+#     if not os.path.exists(_path):
+#         _path = os.path.dirname(os.path.abspath(__file__))
 
 
 if __name__ == '__main__':
-    # convert interface
-    if len(sys.argv) == 2:
-        port = int(sys.argv[1])
-    else:
-        port = 15010
-
-    globals().update({"md5": "1"+"0"*15})
+    port = 15010
+    globals().update({"md5": "1" + "0" * 15})
     globals().update({"port": str(port)})
-    # _global._init()
-    # _global.update({"md5": "1"+"0"*15})
-    # _global.update({"port": str(port)})
-
-    # ip = get_intranet_ip()
-    # log("my ip"+str(ip))
-    # ip = "http://" + ip
     ip_port_dict = get_ip_port()
-
     set_flask_global()
+    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
 
-    if get_platform() == "Windows":
-        app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
-    else:
-        # app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
-        app.run(port=15011)
-
-    # if get_platform() == "Windows":
-    #     file_path = "C:/Users/Administrator/Desktop/test_image/error29.png"
-    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
-    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
-    #     # file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
-    # else:
-    #     file_path = "test1.doc"
-    # test_one(file_path, from_remote=True)
-
-    # if get_platform() == "Windows":
-    #     file_dir = "D:/BIDI_DOC/比地_文档/table_images/"
-    # else:
-    #     file_dir = "../table_images/"
-    #
-    # for j in range(10):
-    #     p = Process(target=test_more, args=(file_dir, j, ))
-    #     p.start()
-    # p.join()
-
-    # if get_platform() == "Windows":
-    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
-    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
-    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/11111111.rar"]
-    #     file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
-    #                       "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"]
-    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc"]
-    #
-    # else:
-    #     file_path_list = ["test1.pdf"]
-    # for j in range(10):
-    #     p = Process(target=test_duplicate, args=(file_path_list, j, ))
-    #     p.start()
-    # p.join()

+ 94 - 22
format_convert/convert_doc.py

@@ -6,7 +6,7 @@ import sys
 import chardet
 from bs4 import BeautifulSoup
 sys.path.append(os.path.dirname(__file__) + "/../")
-from format_convert.convert_tree import _Document, _Sentence, _Page
+from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
 import logging
 import traceback
 from format_convert import get_memory_info
@@ -35,11 +35,71 @@ def doc2text(path, unique_type_dir):
 class DocConvert:
     def __init__(self, path, unique_type_dir):
         self._doc = _Document(path)
+        self._page = _Page(None, 0)
         self.path = path
         self.unique_type_dir = unique_type_dir
         self.tika_html = None
+        print('into DocConvert __init__')
 
     def convert(self):
+        print('into DocConvert convert')
+        # 先判断特殊doc文件,可能是html文本
+        # is_html_doc = False
+        # try:
+        #     try:
+        #         with open(self.path, 'r') as f:
+        #             html_str = f.read()
+        #     except UnicodeDecodeError:
+        #         with open(self.path, 'r', errors='ignore') as f:
+        #             html_str = f.read()
+        #     # if re.search('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str):
+        #     if len(re.findall('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str)) >= 10:
+        #         log('doc as html!')
+        #         soup = BeautifulSoup(html_str, 'lxml')
+        #         text = soup.text
+        #         is_html_doc = True
+        # except:
+        #     pass
+        #
+        # if is_html_doc:
+        #     self._page = _Page(None, 0)
+        #     _sen = _Sentence(text, (0, 0, 0, 0))
+        #     self._page.add_child(_sen)
+        #     self._doc.add_child(self._page)
+
+        # 先判断特殊doc文件,可能是html文本
+        is_html_doc = self.maybe_html()
+
+        if not is_html_doc:
+            # 调用office格式转换
+            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
+            if judge_error_code(file_path):
+                # office转换失败,调用tika,提取各个类型对象
+                try:
+                    self.use_tika(self.path)
+                except:
+                    traceback.print_exc()
+                    self._doc.error_code = [-17]
+                    log('doc tika failed too')
+                return
+
+            _docx = DocxConvert(file_path, self.unique_type_dir)
+            _docx.convert()
+            self._doc = _docx._doc
+            # if self._doc.error_code is not None:
+            #     # docx提取失败,调用tika,提取各个类型对象
+            #     print('DocxConvert failed use_tika')
+            #     self.use_tika(self.path)
+            #     self._doc.error_code = None
+            #     # # 调用tika提取
+            #     # html = from_tika_interface(self.path)
+            #     # if judge_error_code(html):
+            #     #     self._doc.error_code = html
+            #     # self.tika_html = html
+            #     # self._doc.error_code = None
+            #     return
+
+    def maybe_html(self):
         # 先判断特殊doc文件,可能是html文本
         is_html_doc = False
         try:
@@ -63,27 +123,39 @@ class DocConvert:
             _sen = _Sentence(text, (0, 0, 0, 0))
             self._page.add_child(_sen)
             self._doc.add_child(self._page)
-        else:
-            # 调用office格式转换
-            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
-            if judge_error_code(file_path):
-                # 调用tika提取
-                html = from_tika_interface(self.path)
-                if judge_error_code(html):
-                    self._doc.error_code = html
-                self.tika_html = html
-                return
-            _docx = DocxConvert(file_path, self.unique_type_dir)
-            _docx.convert()
-            self._doc = _docx._doc
-            if self._doc.error_code is not None:
-                # 调用tika提取
-                html = from_tika_interface(self.path)
-                if judge_error_code(html):
-                    self._doc.error_code = html
-                self.tika_html = html
-                self._doc.error_code = None
-                return
+
+        return is_html_doc
+
+    def use_tika(self, _path):
+        # 调用tika提取
+        # html = from_tika_interface(self.path)
+        # if judge_error_code(html):
+        #     self._doc.error_code = html
+        # self.tika_html = html
+        data = from_tika_interface(_path)
+        if judge_error_code(data):
+            self._doc.error_code = data
+            return
+        current_y = 5
+        for di, d in enumerate(data):
+            data_type, value = d
+            bbox = [0, current_y, 20, current_y+10]
+            current_y += 20
+            if data_type == 'text':
+                _sen = _Sentence(value, bbox)
+                _sen.combine = False
+                self._page.add_child(_sen)
+            elif data_type == 'img':
+                with open(value, "rb") as f:
+                    img = f.read()
+                _img = _Image(img, value, bbox)
+                _img.is_from_docx = True
+                self._page.add_child(_img)
+            elif data_type == 'table':
+                _table = _Table(value, bbox)
+                _table.is_html = True
+                self._page.add_child(_table)
+        self._doc.add_child(self._page)
 
     def get_html(self):
         try:

+ 205 - 18
format_convert/convert_docx.py

@@ -10,7 +10,8 @@ import xml
 import zipfile
 import docx
 from bs4 import BeautifulSoup
-from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
+from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code, \
+    get_table_html
 from format_convert.wrapt_timeout_decorator import timeout
 from format_convert.convert_image import ImageConvert
 from format_convert.convert_need_interface import from_tika_interface
@@ -313,7 +314,7 @@ def read_xml_order(unique_type_dir, document_xml, numbering_xml, document_xml_re
 
 @timeout(50, timeout_exception=TimeoutError)
 def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
-    def recursion_read_table(table):
+    def recursion_read_table(table, show=0):
         table_text = '<table border="1">'
         tr_index = 0
         tr_text_list = []
@@ -349,6 +350,7 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
                             if is_merge == "continue":
                                 row_span_dict[tc_index][0] += 1
                                 tc_index += col_span
+                                tc_text_list.append([tc_text, col_span])
                                 # 跳过,不增加td
                                 continue
                                 # col_span_index = 0
@@ -403,6 +405,11 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
                 tr_index += 1
                 tr_text_list.append(tc_text_list)
 
+        if show:
+            for row in tr_text_list:
+                print('row', row)
+                print('len(row)', len(row))
+
         # 替换所有row_span
         for key in row_span_dict.keys():
             row_span, finish_row_span_flag = row_span_dict.get(key)
@@ -420,7 +427,8 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
         for node in body_nodes:
             if 'w:tbl' in str(node).split(' '):
                 _table = node
-                _table_text = recursion_read_table(_table)
+                # _table_text = recursion_read_table(_table)
+                _table_text = xml_table_to_html(_table, unique_type_dir, numbering_xml, document_xml_rels)
                 table_text_list.append(_table_text)
         return table_text_list
 
@@ -430,6 +438,146 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
         return [-1]
 
 
+def xml_table_to_html(table, unique_type_dir, numbering_xml, document_xml_rels, show=0):
+    tr_index = 0
+    tr_text_list = []
+    last_node_level = 0
+    num_pr_dict = {}
+
+    # 直接子节点用child表示,所有子节点用all表示
+    for table_child in table.childNodes:
+        if 'w:tr' in str(table_child):
+            tr = table_child
+            tr_child_nodes = tr.childNodes
+            tc_index = 0
+            tc_text_list = []
+            for tr_child in tr_child_nodes:
+                if 'w:tc' in str(tr_child).split(' '):
+                    tc_text = ""
+                    tc = tr_child
+                    # 获取一格占多少列,相当于colspan
+                    col_span = tc.getElementsByTagName("w:gridSpan")
+                    if col_span:
+                        col_span = int(col_span[0].getAttribute("w:val"))
+                    else:
+                        col_span = 1
+                    # 获取是否是合并单元格的下一个空单元格,相当于rowspan
+                    is_merge = tc.getElementsByTagName("w:vMerge")
+                    if is_merge:
+                        is_merge = is_merge[0].getAttribute("w:val")
+                        if is_merge == "continue":
+                            tc_index += col_span
+                            tc_text = '@continue@'
+                            tc_text_list.append([tc_text, col_span])
+                            # 跳过,不增加td
+                            continue
+
+                    # 放入文本
+                    tc_child_nodes = tc.childNodes
+                    for tc_child in tc_child_nodes:
+                        # 处理嵌套在tc中的表格
+                        if 'w:tbl' in str(tc_child).split(' '):
+                            tc_text += xml_table_to_html(tc_child, unique_type_dir, numbering_xml, document_xml_rels)
+                        # 处理编号
+                        if 'w:p' in str(tc_child).split(' '):
+                            _t_list, _, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
+                                                                                   tc_child,
+                                                                                   last_node_level,
+                                                                                   num_pr_dict,
+                                                                                   numbering_xml,
+                                                                                   document_xml_rels)
+                            tc_text += ''.join(_t_list)
+                    # 结束该tc
+                    tc_index += col_span
+                    tc_text_list.append([tc_text, col_span])
+            # 结束该tr
+            tr_index += 1
+            tr_text_list.append(tc_text_list)
+
+    if show:
+        for row in tr_text_list:
+            print('row', row)
+            print('len(row)', len(row))
+
+    table_html = row_list_to_table(tr_text_list)
+    return table_html
+
+
+def row_list_to_table(row_list, show=0):
+    if show:
+        print('='*50)
+
+    # 复制合并列
+    new_row_list = []
+    for row in row_list:
+        new_row = []
+        for col, col_span in row:
+            new_row += [[col, col_span]]
+            if col_span > 1:
+                new_row += [[col, 0]] * (col_span - 1)
+        new_row_list.append(new_row)
+    row_list = new_row_list
+
+    if show:
+        for row in row_list:
+            print('copy row', row)
+
+    # 计算是不是每行都有相等列数
+    row_cnt_list = []
+    for row in row_list:
+        row_cnt_list.append(len(row))
+
+    if len(set(row_cnt_list)) != 1:
+        log('表格有列数不同,直接返回text' + str(row_cnt_list))
+        # 直接返回所有col的text
+        text = ''
+        for row in row_list:
+            for col, col_span in row:
+                text += col
+        return text
+
+    new_row_list = []
+    for ri, row in enumerate(row_list):
+        new_row = []
+        for ci, col in enumerate(row):
+            col, col_span = col
+            row_span = 1
+            # 判断下面行同列有没有需合并的
+            for ri2 in range(ri+1, len(row_list)):
+                col2, col_span2 = row_list[ri2][ci]
+                if col2 == '@continue@':
+                    row_span += 1
+                else:
+                    break
+
+            # 需跳过的列
+            if col == '@continue@' or col_span == 0:
+                delete = 1
+            else:
+                delete = 0
+
+            col_dict = {
+                'text': col,
+                'rowspan': row_span,
+                'columnspan': col_span,
+                'delete': delete,
+            }
+            new_row.append(col_dict)
+        new_row_list.append(new_row)
+
+    if show:
+        for new_row in new_row_list:
+            print('new_row', new_row)
+
+    table_html = get_table_html(new_row_list)
+
+    # soup = BeautifulSoup(table_html, 'lxml')
+    # print(soup.prettify())
+    if show:
+        print('-' * 50)
+    return table_html
+
+
 @timeout(25, timeout_exception=TimeoutError)
 def parse_xml(path):
     # 解析xml
@@ -449,6 +597,7 @@ def parse_xml2(path):
 class DocxConvert:
     def __init__(self, path, unique_type_dir):
         self._doc = _Document(path)
+        self._page = _Page(None, 0)
         self.path = path
         self.unique_type_dir = unique_type_dir
 
@@ -497,8 +646,6 @@ class DocxConvert:
             self._doc.error_code = [-3]
 
     def convert(self):
-        self._page = _Page(None, 0)
-
         # 先判断特殊doc文件,可能是html文本
         is_html_doc = False
         try:
@@ -630,23 +777,62 @@ class DocxConvert:
     def get_doc_object(self):
         return self._doc
 
+    def use_tika(self, _path):
+        # 调用tika提取
+        # html = from_tika_interface(self.path)
+        # if judge_error_code(html):
+        #     self._doc.error_code = html
+        # self.tika_html = html
+        data = from_tika_interface(_path)
+        if judge_error_code(data):
+            self._doc.error_code = data
+            return
+        current_y = 5
+        for di, d in enumerate(data):
+            data_type, value = d
+            bbox = [0, current_y, 20, current_y+10]
+            current_y += 20
+            if data_type == 'text':
+                _sen = _Sentence(value, bbox)
+                _sen.combine = False
+                self._page.add_child(_sen)
+            elif data_type == 'img':
+                with open(value, "rb") as f:
+                    img = f.read()
+                _img = _Image(img, value, bbox)
+                _img.is_from_docx = True
+                self._page.add_child(_img)
+            elif data_type == 'table':
+                _table = _Table(value, bbox)
+                _table.is_html = True
+                self._page.add_child(_table)
+        self._doc.add_child(self._page)
+
     def get_html(self):
         if self._doc.error_code is not None:
             return self._doc.error_code
         try:
+            # raise
             self.convert()
         except:
             traceback.print_exc()
             self._doc.error_code = [-1]
         # log('docx error code ' + str(self._doc.error_code))
         if self._doc.error_code is not None:
-            # 调用tika提取
-            html = from_tika_interface(self.path)
-            if judge_error_code(html):
-                self._doc.error_code = html
-                return self._doc.error_code
-            else:
-                return [html]
+            # # 调用tika提取
+            # html = from_tika_interface(self.path)
+            # if judge_error_code(html):
+            #     self._doc.error_code = html
+            #     return self._doc.error_code
+            # else:
+            #     return [html]
+            try:
+                self.use_tika(self.path)
+                self._doc.error_code = None
+            except:
+                traceback.print_exc()
+                log('docx tika failed too')
+                self._doc.error_code = [-17]
         return self._doc.get_html()
 
 
@@ -791,9 +977,10 @@ class DocxConvertNew:
 
 
 if __name__ == '__main__':
-    c = DocxConvert("C:/Users/Administrator/Downloads/dsdsd.docx", "C:/Users/Administrator/Downloads/1/")
-    print(c.get_html())
-
-    # c = DocxConvertNew()
-    # # c.read_docx(r'C:\Users\Administrator\Desktop\test_doc\error14.docx')
-    # c.read_docx(r'C:/Users/Administrator/Downloads/dsdsd.docx')
+    _p = r'C:/Users/Administrator/Downloads/1723004790329.docx'
+    # _p = "C:/Users/Administrator/Desktop/test_doc/error14.docx"
+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
+    c = DocxConvert(_p, save_dir)
+    _html = c.get_html()
+    with open('../result.html', 'w', encoding='utf-8') as f:
+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + str(_html[0]))

+ 819 - 25
format_convert/convert_image.py

@@ -21,7 +21,7 @@ from format_convert.utils import judge_error_code, add_div, LineTable, get_table
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
     from_idc_interface, from_isr_interface
 from format_convert.table_correct import get_rotated_image
-from botr.extract_table import get_table
+from botr.extract_table import get_table, get_b_table_by_blank_colon
 
 
 def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
@@ -66,7 +66,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
     def merge_textbox(textbox_list, in_objs):
         delete_obj = []
         threshold = 5
-        textbox_list.sort(key=lambda x:x.bbox[0])
+        textbox_list.sort(key=lambda x: x.bbox[0])
         for k in range(len(textbox_list)):
             tb1 = textbox_list[k]
             if tb1 not in in_objs and tb1 not in delete_obj:
@@ -74,6 +74,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                     tb2 = textbox_list[m]
                     if tb2 in in_objs:
                         continue
+                    # print('tb1 tb2', tb1, tb2)
                     if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
                             and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
                         if tb1.bbox[0] <= tb2.bbox[0]:
@@ -88,9 +89,9 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                 textbox_list.remove(_obj)
         return textbox_list
 
-    def resize_process(_image_np):
+    def resize_process(_image_np, threshold=2048):
+    # def resize_process(_image_np, threshold=1280):
         # 整体分辨率限制
-        threshold = 2048
         if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
             h, w = get_best_predict_size2(_image_np, threshold=threshold)
             log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
@@ -169,14 +170,24 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
         log("isr total time "+str(time.time()-_isr_time))
         return _image_np
 
-    def ocr_process(_image_np, _threshold=2048):
+    # def ocr_process(_image_np, _threshold=2048):
+    def ocr_process(_image_np, _threshold=1080):
         log("ocr_process image shape " + str(_image_np.shape))
 
+        # 过小直接返回
+        if _image_np.shape[0] <= 10 or _image_np.shape[1] <= 10:
+            return [], []
+        if _image_np.shape[0] < 50 and _image_np.shape[1] / _image_np.shape[0] > 20:
+            return [], []
+        if _image_np.shape[1] < 50 and _image_np.shape[0] / _image_np.shape[1] > 20:
+            return [], []
+
         # ocr图片过大内存溢出,需resize
         # 大图按比例缩小,小图维持不变;若统一拉伸成固定大小如1024会爆显存
         ratio = (1, 1)
         if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
-            best_h, best_w = get_best_predict_size2(_image_np, _threshold)
+            # best_h, best_w = get_best_predict_size2(_image_np, _threshold)
+            best_h, best_w = get_best_predict_size_by_area(_image_np, _threshold)
             _image_np = pil_resize(_image_np, best_h, best_w)
             log("ocr_process image resize " + str(_image_np.shape))
             ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
@@ -189,7 +200,13 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
         # 调用ocr模型接口
         image_bytes = np2bytes(_image_np)
-        text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
+        result = from_ocr_interface(image_bytes, is_table=1)
+        # print('from_ocr_interface result ', result)
+        if len(result) != 2:
+            return result, result
+
+        text_list, bbox_list = result
+        # text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
         if judge_error_code(text_list):
             return text_list, text_list
 
@@ -264,6 +281,13 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
     def botr_process(_image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
                      from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
+
+        temp_list = []
+        for _table2 in table_list2:
+            _table2 = _Table(_table2["table"], _table2["bbox"])
+            temp_list.append(_table2)
+        table_list2 = temp_list
+
         if from_pdf:
             # 交叉验证 ocr结果与pdf obj,暂时使用pdf提取的
             h_ratio = _image_np.shape[0] / pdf_layout_size[1]
@@ -300,14 +324,55 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             box_list2 = pdf_box_list
             text_box_list2 = pdf_text_box_list
 
-        _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, table_list2, text_list2, box_list2, text_box_list2)
-
-        # 保存无边框表格文件
-        if _table_list:
+            _b_table_list = []
+            _not_b_table_list = []
+        else:
+            # 无边框新规则,补充添加 250515
+            # 根据text规律,判断该页是否可能有无边框表格
             try:
-                save_b_table(_image_np, text_box_list2, from_pdf)
+                _b_table_list, _not_b_table_list = get_b_table_by_blank_colon(text_box_list2, table_list2, (
+                0, 0, _image_np.shape[1], _image_np.shape[0]), _image_np)
             except:
-                pass
+                traceback.print_exc()
+                return [-23], [], []
+
+            # print('_b_table_list111', _b_table_list)
+            if _b_table_list:
+                temp_list = []
+                for _b_table in _b_table_list:
+                    _b_table = _Table(_b_table[0], _b_table[1])
+                    # table_list2 += [_b_table]
+                    temp_list.append(_b_table)
+                _b_table_list = temp_list
+            if _not_b_table_list:
+                temp_list = []
+                for _b_table in _not_b_table_list:
+                    _b_table = _Table(_b_table[0], _b_table[1])
+                    temp_list.append(_b_table)
+                _not_b_table_list = temp_list
+
+        ignore_table_list = table_list2 + _b_table_list + _not_b_table_list
+        # yolo检测出的表格,忽略两列的,因为已经补充了两列的新规则 250529
+        _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, ignore_table_list, text_list2, box_list2, text_box_list2, from_pdf=from_pdf)
+        # print('_table_list', _table_list)
+        # print('_b_table_list222', _b_table_list)
+
+        # 无边框新规则,补充添加 250515
+        _table_list = [_Table(x.get('table'), x.get('bbox')) for x in _table_list]
+        _table_list += _b_table_list
+        for _b_table in _b_table_list:
+            for _text_box in text_box_list2:
+                if _b_table.bbox[1] <= _text_box.bbox[1] <= _text_box.bbox[3] <= _b_table.bbox[3]:
+                    # print('add _obj_in_table_list 250515', _text_box)
+                    _obj_in_table_list.append(_text_box)
+        # print('_b_table_list233', _table_list)
+
+        # 保存无边框表格文件
+        # if _table_list:
+        #     try:
+        #         save_b_table(_image_np, text_box_list2, from_pdf)
+        #     except:
+        #         pass
 
         # print('_text_box_list', _text_box_list)
         # print('_table_list', _table_list)
@@ -496,7 +561,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             else:
                 # 根据index拆开图片,重新ocr
                 split_index_list.insert(0, 0)
-                print('split_index_list1', split_index_list)
+                # print('split_index_list1', split_index_list)
                 for _i, index in enumerate(split_index_list):
                     if _i == len(split_index_list) - 1:
                         split_image_np = sub_image_np[:, index:, :]
@@ -602,12 +667,12 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                 # 生成TextBox对象
                 text_box_list = get_text_box_obj(text_list, box_list)
                 # for t in text_box_list:
-                #     print('text_box0', t.get_text())
+                #     print('text_box0', t)
 
                 # 表格生成
                 text_box_list, table_list, obj_in_table_list = table_process(line_list, text_box_list, image_np)
                 # for t in text_box_list:
-                #     print('text_box1', t.get_text())
+                #     print('text_box1', t)
                 # print('table_list', table_list)
                 # for t in obj_in_table_list:
                 #     print('obj_text_box2', t.get_text())
@@ -625,10 +690,20 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                                                                                 pdf_layout_size,
                                                                                 )
                 log('botr process cost: ' + str(time.time()-start_time))
+                if judge_error_code(text_box_list):
+                    return text_box_list
+
+                # print('b_table_list333', b_table_list)
+                obj_in_table_list.update(set(b_obj_in_table_list))
+                # for t in text_box_list:
+                #     print('text_box2', t)
 
                 # 合并非表格的同一行TextBox
                 text_box_list = merge_textbox(text_box_list, obj_in_table_list)
 
+                # for t in text_box_list:
+                #     print('text_box3', t)
+                # print('table_list, b_table_list', table_list, b_table_list)
                 table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
 
             if reverse_flag:
@@ -649,16 +724,21 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             _add_y = 0
             for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
                 obj_list = []
+                # print('obj_in_table_list', obj_in_table_list)
                 for table in table_list:
-                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y,
+                                   table["bbox"][2], table["bbox"][3] + _add_y]
                     _table = _Table(table["table"], _table_bbox)
+                    # print('_table.bbo2x', _table.bbox)
                     obj_list.append(_table)
                 for table in b_table_list:
-                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
-                    _table = _Table(table["table"], _table_bbox)
-                    obj_list.append(_table)
+                    # _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+                    # _table = _Table(table["table"], _table_bbox)
+                    # print('table.bbo1x', table.bbox)
+                    obj_list.append(table)
                 for text_box in text_box_list:
                     if text_box not in obj_in_table_list:
+                        # print('text_box',  text_box)
                         text_box.bbox[1] += _add_y
                         obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
 
@@ -707,6 +787,8 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                                                                         pdf_layout_size,
                                                                         )
             log('botr process cost: ' + str(time.time()-start_time))
+            if judge_error_code(text_box_list):
+                return text_box_list
 
             # 合并非表格的同一行TextBox
             text_box_list = merge_textbox(text_box_list, obj_in_table_list)
@@ -715,8 +797,10 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             obj_list = []
             # print('table_list', table_list)
             for table in table_list:
-                _table = _Table(table["table"], table["bbox"])
-                obj_list.append(_table)
+                # print('type(table)', type(table))
+                # _table = _Table(table["table"], table["bbox"])
+                # print('table.bbox', table.bbox)
+                obj_list.append(table)
             for text_box in text_box_list:
                 if text_box not in obj_in_table_list:
                     obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
@@ -732,6 +816,690 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
         return [-1]
 
 
+# class ImageProcess:
+#     def __init__(self, image_np, image_path, is_from_pdf=False, is_from_docx=False,
+#                  b_table_from_text=False, pdf_obj_list=[], pdf_layout_size=(),
+#                  is_reverse=False):
+#
+#         self.image_np = image_np
+#         self.image_path = image_path
+#         self.is_from_pdf = is_from_pdf
+#         self.is_from_docx = is_from_docx
+#         self.b_table_from_text = b_table_from_text
+#         self.pdf_obj_list = pdf_obj_list
+#         self.pdf_layout_size = pdf_layout_size
+#         self.is_reverse = is_reverse
+#
+#     def merge_textbox(self, textbox_list, in_objs):
+#         delete_obj = []
+#         threshold = 5
+#         textbox_list.sort(key=lambda x:x.bbox[0])
+#         for k in range(len(textbox_list)):
+#             tb1 = textbox_list[k]
+#             if tb1 not in in_objs and tb1 not in delete_obj:
+#                 for m in range(k+1, len(textbox_list)):
+#                     tb2 = textbox_list[m]
+#                     if tb2 in in_objs:
+#                         continue
+#                     if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
+#                             and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
+#                         if tb1.bbox[0] <= tb2.bbox[0]:
+#                             tb1.text = tb1.text + tb2.text
+#                         else:
+#                             tb1.text = tb2.text + tb1.text
+#                         tb1.bbox[0] = min(tb1.bbox[0], tb2.bbox[0])
+#                         tb1.bbox[2] = max(tb1.bbox[2], tb2.bbox[2])
+#                         delete_obj.append(tb2)
+#         for _obj in delete_obj:
+#             if _obj in textbox_list:
+#                 textbox_list.remove(_obj)
+#         return textbox_list
+#
+#     def resize_process(self, _image_np):
+#         # 整体分辨率限制
+#         threshold = 2048
+#         if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
+#             h, w = get_best_predict_size2(_image_np, threshold=threshold)
+#             log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
+#             _image_np = pil_resize(_image_np, h, w)
+#         return _image_np
+#
+#     def idc_process(self, _image_np, return_angle=False):
+#         # 图片倾斜校正,写入原来的图片路径
+#         # print("image_process", image_path)
+#         # g_r_i = get_rotated_image(_image_np, image_path)
+#         # if judge_error_code(g_r_i):
+#         #     if is_from_docx:
+#         #         return []
+#         #     else:
+#         #         return g_r_i
+#         # _image_np = cv2.imread(image_path)
+#         # if _image_np is None:
+#         #     return []
+#         # return _image_np
+#
+#         # if _image_np is None:
+#         #     return []
+#
+#         # idc模型实现图片倾斜校正
+#         h, w = get_best_predict_size2(_image_np, 1080)
+#         image_resize = pil_resize(_image_np, h, w)
+#         # image_resize_path = image_path.split(".")[0] + "_resize_idc." + image_path.split(".")[-1]
+#         # cv2.imwrite(image_resize_path, image_resize)
+#
+#         # with open(image_resize_path, "rb") as f:
+#         #     image_bytes = f.read()
+#         image_bytes = np2bytes(image_resize)
+#         angle = from_idc_interface(image_bytes)
+#         log('idc_process angle ' + str(angle))
+#         if judge_error_code(angle):
+#             if return_angle:
+#                 if self.is_from_docx:
+#                     return [], []
+#                 else:
+#                     return angle, angle
+#             else:
+#                 if self.is_from_docx:
+#                     return []
+#                 else:
+#                     return angle
+#         # 根据角度旋转
+#         # _image_pil = Image.fromarray(_image_np)
+#         # _image_np = np.array(_image_pil.rotate(angle, expand=1))
+#         _image_np = image_rotate(_image_np, angle)
+#
+#         # 写入
+#         # idc_path = image_path.split(".")[0] + "_idc." + image_path.split(".")[-1]
+#         # cv2.imwrite(idc_path, image_np)
+#         if return_angle:
+#             return _image_np, angle
+#         return _image_np
+#
+#     def isr_process(self, _image_np):
+#         log("isr_process image shape " + str(_image_np.shape))
+#         image_np_copy = copy.deepcopy(_image_np)
+#         # isr模型去除印章
+#         _isr_time = time.time()
+#         if count_red_pixel(_image_np):
+#             # 红色像素达到一定值才过模型
+#             image_bytes = np2bytes(_image_np)
+#             _image_np = from_isr_interface(image_bytes)
+#             if judge_error_code(_image_np):
+#                 if self.is_from_docx:
+#                     return []
+#                 else:
+#                     return _image_np
+#             # [1]代表检测不到印章,直接返回
+#             if isinstance(_image_np, list) and _image_np == [1]:
+#                 log("no seals detected!")
+#                 _image_np = image_np_copy
+#         log("isr total time "+str(time.time()-_isr_time))
+#         return _image_np
+#
+#     def ocr_process(self, _image_np, _threshold=2048):
+#         log("ocr_process image shape " + str(_image_np.shape))
+#
+#         # ocr图片过大内存溢出,需resize
+#         # 大图按比例缩小,小图维持不变;若统一拉伸成固定大小如1024会爆显存
+#         ratio = (1, 1)
+#         h, w = _image_np.shape[:2]
+#         if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
+#             best_h, best_w = get_best_predict_size2(_image_np, _threshold)
+#             _image_np = pil_resize(_image_np, best_h, best_w)
+#             log("ocr_process image resize " + str(_image_np.shape))
+#             ratio = (h/best_h, w/best_w)
+#
+#         # 大图片ocr加锁,防止爆显存
+#         # if _image_np.shape[0] >= 1024 and _image_np.shape[1] >= 1024:
+#         #     file_lock = True
+#         # else:
+#         #     file_lock = False
+#
+#         # 调用ocr模型接口
+#         image_bytes = np2bytes(_image_np)
+#         text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
+#         if judge_error_code(text_list):
+#             return text_list, text_list
+#
+#         for i in range(len(bbox_list)):
+#             point = bbox_list[i]
+#             bbox_list[i] = [[int(point[0][0]*ratio[0]), int(point[0][1]*ratio[1])],
+#                             [int(point[1][0]*ratio[0]), int(point[1][1]*ratio[1])],
+#                             [int(point[2][0]*ratio[0]), int(point[2][1]*ratio[1])],
+#                             [int(point[3][0]*ratio[0]), int(point[3][1]*ratio[1])]]
+#
+#         # 去除水印字 根据识别是否为矩形框
+#         temp_text_list = []
+#         temp_bbox_list = []
+#         water_mark_dict = {}
+#         for i in range(len(bbox_list)):
+#             bbox = bbox_list[i]
+#             text = text_list[i]
+#             if len(re.findall('[\u4e00-\u9fa5]', text)) == len(text):
+#                 if (abs(bbox[0][1] - bbox[1][1]) <= 2 and abs(bbox[2][1] - bbox[3][1]) <= 2) \
+#                         or (abs(bbox[0][0] - bbox[3][0]) <= 4 and abs(bbox[2][0] - bbox[1][0]) <= 4):
+#                     temp_text_list.append(text)
+#                     temp_bbox_list.append(bbox)
+#                 else:
+#                     if text in water_mark_dict.keys():
+#                         water_mark_dict[text] += [bbox]
+#                     else:
+#                         water_mark_dict[text] = [bbox]
+#             else:
+#                 temp_text_list.append(text)
+#                 temp_bbox_list.append(bbox)
+#
+#         # 数量多的才算水印
+#         for text in water_mark_dict.keys():
+#             bbox_list = water_mark_dict.get(text)
+#             if len(bbox_list) < 3:
+#                 for bbox in bbox_list:
+#                     temp_text_list.append(text)
+#                     temp_bbox_list.append(bbox)
+#
+#         text_list = temp_text_list
+#         bbox_list = temp_bbox_list
+#         return text_list, bbox_list
+#
+#     def otr_process(self, _image_np):
+#         log("otr_process image shape " + str(_image_np.shape))
+#         # otr模型识别表格,需要图片resize成模型所需大小, 写入另一个路径
+#         best_h, best_w = get_best_predict_size(_image_np)
+#         image_resize = pil_resize(_image_np, best_h, best_w)
+#         # image_resize_path = image_path.split(".")[0] + "_resize_otr." + image_path.split(".")[-1]
+#         # cv2.imwrite(image_resize_path, image_resize)
+#
+#         # 调用otr模型接口
+#         # with open(image_resize_path, "rb") as f:
+#         #     image_bytes = f.read()
+#         image_bytes = np2bytes(image_resize)
+#         list_line = from_otr_interface(image_bytes, self.is_from_pdf)
+#         if judge_error_code(list_line):
+#             if self.is_from_docx:
+#                 return []
+#             else:
+#                 return list_line
+#
+#         # otr resize后得到的bbox根据比例还原
+#         start_time = time.time()
+#         ratio = (_image_np.shape[0]/best_h, _image_np.shape[1]/best_w)
+#         for i in range(len(list_line)):
+#             point = list_line[i]
+#             list_line[i] = [int(point[0]*ratio[1]), int(point[1]*ratio[0]),
+#                             int(point[2]*ratio[1]), int(point[3]*ratio[0])]
+#         log("otr resize bbox recover " + str(time.time()-start_time))
+#         return list_line
+#
+#     def botr_process(self, _image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
+#                      from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
+#         if from_pdf:
+#             # 交叉验证 ocr结果与pdf obj,暂时使用pdf提取的
+#             h_ratio = _image_np.shape[0] / pdf_layout_size[1]
+#             w_ratio = _image_np.shape[1] / pdf_layout_size[0]
+#             pdf_text_list = []
+#             pdf_box_list = []
+#             for obj in pdf_obj_list:
+#                 if obj.get_text() in ["", " "]:
+#                     continue
+#
+#                 # pdf坐标是上下颠倒的
+#                 # obj.bbox = (obj.bbox[0], pdf_layout_size[1]-obj.bbox[3],
+#                 #             obj.bbox[2], pdf_layout_size[1]-obj.bbox[1])
+#
+#                 # 根据两个页面大小比例调整坐标
+#                 obj.bbox = (obj.bbox[0]*w_ratio, obj.bbox[1]*h_ratio,
+#                             obj.bbox[2]*w_ratio, obj.bbox[3]*h_ratio)
+#
+#                 # 剔除水印字
+#                 text = re.sub('[\n ]', '', obj.get_text())
+#                 if len(text) == 1 and abs(obj.bbox[0] - obj.bbox[2]) >= 70:
+#                     continue
+#
+#                 pdf_box_list.append([[int(obj.bbox[0]), int(obj.bbox[1])],
+#                                      [],
+#                                      [int(obj.bbox[2]), int(obj.bbox[3])],
+#                                      []
+#                                      ])
+#                 pdf_text_list.append(re.sub('[\n]', '', obj.get_text()))
+#
+#             pdf_text_box_list = self.get_text_box_obj(pdf_text_list, pdf_box_list)
+#
+#             text_list2 = pdf_text_list
+#             box_list2 = pdf_box_list
+#             text_box_list2 = pdf_text_box_list
+#
+#         _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, table_list2, text_list2, box_list2, text_box_list2, from_pdf=from_pdf)
+#
+#         # 保存无边框表格文件
+#         if _table_list:
+#             try:
+#                 self.save_b_table(_image_np, text_box_list2, from_pdf)
+#             except:
+#                 pass
+#
+#         # print('_text_box_list', _text_box_list)
+#         # print('_table_list', _table_list)
+#         if from_pdf:
+#             text_box_list2 = []
+#             table_list2 = []
+#
+#         if _table_list and _text_box_list:
+#             text_box_list2 += _text_box_list
+#             text_box_list2 = list(set(text_box_list2))
+#             # table_list2 += _table_list
+#             # obj_in_table_list2 = obj_in_table_list2.union(_obj_in_table_list)
+#         return text_box_list2, _table_list, _obj_in_table_list
+#
+#     def table_process(self, list_line, list_text_boxes, _image_np):
+#         # 调用现成方法形成表格
+#         try:
+#             if list_line:
+#
+#                 # 排除掉短且经过文字bbox中间的竖线
+#                 temp_list = []
+#                 for line in list_line:
+#                     find_cnt = 0
+#                     if abs(line[0]-line[2]) < abs(line[1]-line[3]) and abs(line[1] - line[3]) <= _image_np.shape[0] / 20:
+#                         for t_obj in list_text_boxes:
+#                             # if not (t_obj.bbox[1] <= line[1] <= t_obj.bbox[3] or t_obj.bbox[1] <= line[3] <= t_obj.bbox[3]):
+#                             #     continue
+#                             if line_iou([[t_obj.bbox[1], 0], [t_obj.bbox[3], 0]], [[line[1], 0], [line[3], 0]]) < 0.3:
+#                                 continue
+#                             if abs(t_obj.bbox[0]-t_obj.bbox[2])/5 + min(t_obj.bbox[0], t_obj.bbox[2]) <= line[0] <= abs(t_obj.bbox[0]-t_obj.bbox[2])/5*4 + min(t_obj.bbox[0], t_obj.bbox[2]) and (t_obj.bbox[0]-t_obj.bbox[2]) <= 60:
+#                                 # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2], t_obj.get_text())
+#                                 find_cnt += 1
+#                                 if find_cnt >= 2:
+#                                     break
+#                     if find_cnt >= 2:
+#                         continue
+#                     temp_list.append(line)
+#                 list_line = temp_list
+#
+#                 from format_convert.convert_tree import TableLine
+#                 list_lines = []
+#                 for line in list_line:
+#                     list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
+#
+#                 lt = LineTable()
+#                 tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
+#                                                                                    sourceP_LB=False, splited=False,
+#                                                                                    from_pdf=self.is_from_pdf,
+#                                                                                    is_reverse=self.is_reverse)
+#                 # 需分割textbox
+#                 if connect_textbox_list:
+#                     list_text_boxes = self.table_textbox_split(_image_np, connect_textbox_list, list_text_boxes)
+#                     # 新的textbox,重新做表格
+#                     tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
+#                                                                                        sourceP_LB=False, splited=True,
+#                                                                                        from_pdf=self.is_from_pdf,
+#                                                                                        is_reverse=self.is_reverse)
+#
+#                 if not tables:
+#                     return list_text_boxes, tables, obj_in_table
+#                 return list_text_boxes, tables, obj_in_table
+#             else:
+#                 return list_text_boxes, [], set()
+#         except:
+#             traceback.print_exc()
+#             return [-8], [-8], [-8]
+#
+#     def slice_process(self, _image_np):
+#         slice_flag = need_image_slice(_image_np)
+#         log("need_image_slice " + str(slice_flag) + " " + str(_image_np.shape))
+#         _image_np_list = [_image_np]
+#         if slice_flag:
+#             # 长图分割
+#             _image_np_list = image_slice_new(_image_np)
+#             angle_dict = {}
+#             for im in _image_np_list:
+#                 _, angle = self.idc_process(im, return_angle=True)
+#                 if angle in [0, 360]:
+#                     angle = 0
+#                 if angle in angle_dict.keys():
+#                     angle_dict[angle] += 1
+#                 else:
+#                     angle_dict[angle] = 1
+#
+#             # idc不太准,有0度就直接使用
+#             if 0 in angle_dict.keys():
+#                 log('image_slice 0 in angle_dict')
+#                 angle = 0
+#             else:
+#                 angle_list = [[key, value] for key, value in angle_dict.items()]
+#                 angle_list.sort(key=lambda x: x[1])
+#                 log('image_slice angle_list ' + str(angle_list))
+#                 angle = angle_list[-1][0]
+#             for i in range(len(_image_np_list)):
+#                 _image_np_list[i] = image_rotate(_image_np_list[i], angle)
+#             if angle in [180]:
+#                 _image_np_list.reverse()
+#
+#         if len(_image_np_list) < 1:
+#             log("image_slice failed!")
+#             _image_np_list = [_image_np]
+#         return _image_np_list
+#
+#     def get_text_box_obj(self, _text_list, _bbox_list):
+#         from format_convert.convert_tree import TextBox
+#         _text_box_list = []
+#         for i in range(len(_bbox_list)):
+#             bbox = _bbox_list[i]
+#             b_text = _text_list[i]
+#             _text_box_list.append(TextBox([bbox[0][0], bbox[0][1],
+#                                            bbox[2][0], bbox[2][1]], b_text))
+#         return _text_box_list
+#
+#     def save_b_table(self, image_np2, text_box_list2, from_pdf=False):
+#         _start_time = time.time()
+#         _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table'
+#         # _path = 'D:/Project/format_conversion_maxcompute/save_b_table'
+#         max_index = 20000
+#         if os.path.exists(_path):
+#             file_list = glob(_path + '/*')
+#             if file_list:
+#                 file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
+#                 file_index_list.sort(key=lambda x: x)
+#                 index = file_index_list[-1] + 1
+#             else:
+#                 index = 0
+#             if index > max_index:
+#                 return
+#
+#             # 文件md5
+#             from format_convert import _global
+#             _md5 = _global.get("md5")
+#
+#             _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
+#             cv2.imwrite(_image_path, image_np2)
+#             log('save b_table image success!')
+#
+#             # if from_pdf:
+#             #     _file_path = _path + '/' + str(_md5) + '-' + str(index) + '.txt'
+#             #     new_text_box_list2 = [str(x) + '\n' for x in text_box_list2]
+#             #     with open(_file_path, 'w') as f:
+#             #         f.writelines(new_text_box_list2)
+#             #     log('save b_table txt success!')
+#
+#         log('save_b_table cost: ' + str(time.time()-_start_time))
+#
+#     def table_textbox_split(self, image_np2, connect_textbox_list, textbox_list):
+#         """
+#         两个单元格里的文本被ocr识别为一个,需分开才能准确放进表格
+#
+#         :return:
+#         """
+#         split_bbox_list = []
+#         split_text_list = []
+#         splited_textbox_list = []
+#         for textbox in connect_textbox_list:
+#             bbox = textbox.bbox
+#             bbox = [[bbox[0], bbox[1]], [], [bbox[2], bbox[3]], []]
+#             sub_image_np = image_np2[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
+#             split_index_list = []
+#             # 从左到右遍历img
+#             for i in range(5, sub_image_np.shape[1]-5):
+#                 # 找表格分割线,这一列都为黑色像素
+#                 if np.where(sub_image_np[:, i, 0] < 200)[0].size >= sub_image_np.shape[0]:
+#                     split_index_list.append(i)
+#
+#             # 判断两线之间宽度,去重
+#             if len(split_index_list) > 1:
+#                 last_index = split_index_list[0]
+#                 temp_list = []
+#                 delete_list = []
+#                 for index in split_index_list[1:]:
+#                     if index in delete_list:
+#                         continue
+#                     if index - last_index <= 5:
+#                         delete_list.append(index)
+#                     else:
+#                         last_index = index
+#                     temp_list.append(last_index)
+#                 split_index_list = temp_list
+#
+#             # n条以上分割线,有问题
+#             if len(split_index_list) == 0 or len(split_index_list) >= 2:
+#                 # print('len(split_index_list)', len(split_index_list), split_index_list)
+#                 continue
+#             else:
+#                 # 根据index拆开图片,重新ocr
+#                 split_index_list.insert(0, 0)
+#                 print('split_index_list1', split_index_list)
+#                 for _i, index in enumerate(split_index_list):
+#                     if _i == len(split_index_list) - 1:
+#                         split_image_np = sub_image_np[:, index:, :]
+#                         split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[2][0], bbox[2][1]], []])
+#                     else:
+#                         next_index = split_index_list[_i+1]
+#                         split_image_np = sub_image_np[:, index:next_index, :]
+#                         split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[0][0]+next_index, bbox[2][1]], []])
+#
+#                     # ocr
+#                     split_image_bytes = np2bytes(split_image_np)
+#                     text_list2, bbox_list2 = from_ocr_interface(split_image_bytes, is_table=1, only_rec=1)
+#                     # print('text_list2', text_list2)
+#                     # print('bbox_list2', split_bbox_list)
+#                     if judge_error_code(text_list2):
+#                         text2 = ''
+#                     else:
+#                         if text_list2:
+#                             text2 = text_list2[0]
+#                         else:
+#                             text2 = ''
+#                     split_text_list.append(text2)
+#                 splited_textbox_list.append(textbox)
+#
+#         if split_text_list and split_bbox_list:
+#             split_textbox_list = self.get_text_box_obj(split_text_list, split_bbox_list)
+#             for tb in splited_textbox_list:
+#                 if tb in textbox_list:
+#                     textbox_list.remove(tb)
+#             textbox_list += split_textbox_list
+#
+#         return textbox_list
+#
+#     def __call__(self):
+#         from format_convert.convert_tree import _Table, _Sentence
+#         log("into image_preprocess")
+#         try:
+#             if self.image_np is None:
+#                 log("image_preprocess image_np is None")
+#                 return []
+#             if self.image_np.shape[0] <= 20 or self.image_np.shape[1] <= 20:
+#                 log('image_np.shape[0] <= 20 or image_np.shape[1] <= 20')
+#                 return []
+#
+#             if not self.b_table_from_text:
+#                 # 判断是否需要长图分割
+#                 idc_flag = False
+#                 image_np_list = self.slice_process(self.image_np)
+#                 if len(image_np_list) > 1:
+#                     idc_flag = True
+#
+#                 reverse_flag = 0
+#                 table_textbox_list = []
+#                 for image_np in image_np_list:
+#                     # 整体分辨率限制
+#                     image_np = self.resize_process(image_np)
+#
+#                     # 印章去除
+#                     image_np = self.isr_process(image_np)
+#                     if isinstance(image_np, list):
+#                         return image_np
+#
+#                     # 文字识别
+#                     text_list, box_list = self.ocr_process(image_np)
+#                     if judge_error_code(text_list):
+#                         return text_list
+#
+#                     # 判断ocr识别是否正确
+#                     # print('ocr_cant_read(text_list, box_list)', ocr_cant_read(text_list, box_list), idc_flag, text_list)
+#                     if ocr_cant_read(text_list, box_list) and not idc_flag:
+#                         # 方向分类
+#                         image_np, angle = self.idc_process(image_np, return_angle=True)
+#                         if isinstance(image_np, list):
+#                             return image_np
+#                         # 如果角度不变,旋转180
+#                         if angle in [0, 360]:
+#                             pass
+#                             # log('ocr_cant_read image_rotate 180')
+#                             # image_np = image_rotate(image_np, angle=180)
+#                             # reverse_flag = 1
+#                             # image_pil = Image.fromarray(image_np)
+#                             # image_np = np.array(image_pil.rotate(180, expand=1))
+#                         # cv2.imshow("idc_process", image_np)
+#                         # cv2.waitKey(0)
+#
+#                         # 文字识别
+#                         text_list1, box_list_1 = self.ocr_process(image_np)
+#                         if judge_error_code(text_list1):
+#                             return text_list1
+#
+#                         if len(text_list1) > 0 and ocr_cant_read(text_list1, box_list_1) and self.is_from_pdf:
+#                             return [-16]
+#
+#                         # 比较字数
+#                         # print("ocr process", len("".join(text_list)), len("".join(text_list1)))
+#                         if len("".join(text_list)) < len("".join(text_list1)):
+#                             text_list = text_list1
+#                             box_list = box_list_1
+#
+#                     # 表格识别
+#                     line_list = self.otr_process(image_np)
+#                     if judge_error_code(line_list):
+#                         return line_list
+#
+#                     # 生成TextBox对象
+#                     text_box_list = self.get_text_box_obj(text_list, box_list)
+#                     # for t in text_box_list:
+#                     #     print('text_box0', t.get_text())
+#
+#                     # 表格生成
+#                     text_box_list, table_list, obj_in_table_list = self.table_process(line_list, text_box_list, image_np)
+#                     # for t in text_box_list:
+#                     #     print('text_box1', t.get_text())
+#                     # print('table_list', table_list)
+#                     # for t in obj_in_table_list:
+#                     #     print('obj_text_box2', t.get_text())
+#                     if judge_error_code(table_list):
+#                         return table_list
+#
+#                     # 无边框表格识别
+#                     start_time = time.time()
+#                     text_box_list, b_table_list, b_obj_in_table_list \
+#                         = self.botr_process(image_np, table_list, text_list, box_list,
+#                                             text_box_list, obj_in_table_list, self.b_table_from_text,
+#                                             self.pdf_obj_list, self.pdf_layout_size,
+#                                             )
+#                     log('botr process cost: ' + str(time.time()-start_time))
+#
+#                     # 合并非表格的同一行TextBox
+#                     text_box_list = self.merge_textbox(text_box_list, obj_in_table_list)
+#
+#                     table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
+#
+#                 if reverse_flag:
+#                     table_textbox_list.reverse()
+#
+#                     for i in range(len(image_np_list)):
+#                         image_np_list[i] = image_rotate(image_np_list[i], angle=180)
+#                     image_np_list.reverse()
+#
+#                 # index = 0
+#                 # for image_np in image_np_list:
+#                 #     cv2.imshow(str(index) + '.jpg', image_np)
+#                 #     cv2.waitKey(0)
+#                 #     index += 1
+#
+#                 # 对象生成
+#                 all_obj_list = []
+#                 _add_y = 0
+#                 for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
+#                     obj_list = []
+#                     for table in table_list:
+#                         _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+#                         _table = _Table(table["table"], _table_bbox)
+#                         obj_list.append(_table)
+#                     for table in b_table_list:
+#                         _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+#                         _table = _Table(table["table"], _table_bbox)
+#                         obj_list.append(_table)
+#                     for text_box in text_box_list:
+#                         if text_box not in obj_in_table_list:
+#                             text_box.bbox[1] += _add_y
+#                             obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
+#
+#                     # 多图修正y
+#                     if len(image_np_list) > 1:
+#                         list_y = []
+#                         for obj in obj_list:
+#                             obj.y += _add_y
+#                             list_y.append(obj.y)
+#                         if len(list_y) > 0:
+#                             _add_y += max(list_y)
+#
+#                     # 合并
+#                     all_obj_list += obj_list
+#
+#             # 无边框表格图片
+#             else:
+#                 all_obj_list = []
+#                 table_list = []
+#                 text_list = []
+#                 box_list = []
+#                 text_box_list = []
+#                 obj_in_table_list = set()
+#
+#                 # 表格识别
+#                 line_list = self.otr_process(self.image_np)
+#                 if judge_error_code(line_list):
+#                     return line_list
+#
+#                 # 生成TextBox对象
+#                 text_box_list = self.get_text_box_obj(text_list, box_list)
+#
+#                 # 表格生成
+#                 text_box_list, table_list, obj_in_table_list = self.table_process(line_list, text_box_list, self.image_np)
+#                 if judge_error_code(table_list):
+#                     return table_list
+#
+#                 # 无边框表格识别
+#                 start_time = time.time()
+#                 text_box_list, table_list, obj_in_table_list \
+#                     = self.botr_process(self.image_np, table_list,
+#                                         text_list, box_list,
+#                                                                             text_box_list,
+#                                                                             obj_in_table_list,
+#                                         self.b_table_from_text,
+#                                         self.pdf_obj_list,
+#                                         self.pdf_layout_size,
+#                                                                             )
+#                 log('botr process cost: ' + str(time.time()-start_time))
+#
+#                 # 合并非表格的同一行TextBox
+#                 text_box_list = self.merge_textbox(text_box_list, obj_in_table_list)
+#
+#                 # 对象生成
+#                 obj_list = []
+#                 # print('table_list', table_list)
+#                 for table in table_list:
+#                     _table = _Table(table["table"], table["bbox"])
+#                     obj_list.append(_table)
+#                 for text_box in text_box_list:
+#                     if text_box not in obj_in_table_list:
+#                         obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
+#
+#                 # 合并
+#                 all_obj_list += obj_list
+#
+#             return all_obj_list
+#
+#         except Exception as e:
+#             log("image_preprocess error")
+#             traceback.print_exc()
+#             return [-1]
+
+
 @memory_decorator
 def picture2text(path, html=False):
     log("into picture2text")
@@ -786,6 +1554,21 @@ def get_best_predict_size2(image_np, threshold=3000):
     return h, w
 
 
+def get_best_predict_size_by_area(image_np, threshold=1280):
+    """Return a (height, width) pair whose area does not exceed threshold**2.
+
+    Keeps the aspect ratio; an image already within the area budget is
+    returned at its original size. Note the return order is (height, width).
+    """
+    max_area = threshold*threshold
+    height, width = image_np.shape[:2]
+    area = height * width
+
+    if area <= max_area:
+        return height, width
+
+    # Uniform scale factor: (h*s) * (w*s) == max_area; int() floors, so the
+    # resulting area never exceeds the budget.
+    scale = (max_area / area) ** 0.5
+    new_width = int(width * scale)
+    new_height = int(height * scale)
+    return new_height, new_width
+
+
 def image_slice(image_np):
     """
     slice the image if the height is to large
@@ -1269,6 +2052,17 @@ def image_process_old(image_np, image_path, is_from_pdf=False, is_from_docx=Fals
 
 
 if __name__ == "__main__":
-    img111 = cv2.imread("C:/Users/Administrator/Downloads/1724146601927.png")
-    cv2.imshow('111', img111)
-    cv2.waitKey(0)
+    # _pp = r'D:\Project\format_conversion_maxcompute\save_b_table' \
+    #       r'\211-6591070e1cc8ea6904ba00a0a3d6c32f.png'
+    _pp = r'C:\Users\Administrator\Desktop\test_b_table\error7.png'
+    save_pp = r'D:\Project\format_conversion_maxcompute\format_convert\temp\test_convert_image.jpg'
+    # img111 = cv2.imread(_pp)
+    # img111 = pil_resize(img111, 1024, 768)
+    # cv2.imwrite(save_pp, img111)
+    # image_process(img111, '')
+    # cv2.imshow('111', img111)
+    # cv2.waitKey(0)
+
+    _html = ImageConvert(_pp, r"D:\Project\format_conversion_maxcompute\format_convert\temp").get_html()
+    with open('../result.html', 'w', encoding='utf-8') as f:
+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])

+ 26 - 9
format_convert/convert_need_interface.py

@@ -144,6 +144,7 @@ def from_office_interface_240606(src_path, dest_path, target_format, retry_times
 
 
 def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
+    start_time = time.time()
     try:
         if from_remote:
             # 重试
@@ -200,6 +201,8 @@ def from_office_interface(src_path, dest_path, target_format, retry_times=1, fro
         log("from_office_interface error!")
         traceback.print_exc()
         return [-1]
+    finally:
+        log("from_office_interface cost time " + str(time.time()-start_time))
 
 
 def from_tika_interface(src_path, from_remote=FROM_REMOTE):
@@ -239,17 +242,21 @@ def from_tika_interface(src_path, from_remote=FROM_REMOTE):
             return [-2]
 
         _dict = r
-        html = _dict.get("html")
-        log("from_tika_interface cost time " + str(time.time()-start_time))
-        return html
+        data = _dict.get("data")
+
+        return data
     except Exception as e:
         log("from_tika_interface error!")
         traceback.print_exc()
         return [-11]
+    finally:
+        log("from_tika_interface cost time " + str(time.time()-start_time))
 
 
 def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_REMOTE):
     log("into from_ocr_interface")
+    # print('FROM_REMOTE', FROM_REMOTE)
+    start_time = time.time()
     try:
         base64_stream = base64.b64encode(image_stream)
 
@@ -281,7 +288,10 @@ def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_RE
                             log("retry post ocr_interface... left times " + str(retry_times_1))
                             continue
                     if judge_error_code(r):
-                        return r
+                        if is_table:
+                            return r, r
+                        else:
+                            return r
                     break
             else:
                 if globals().get("global_ocr_model") is None:
@@ -326,6 +336,8 @@ def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_RE
             return [-1], [-1]
         else:
             return [-1]
+    finally:
+        log("from_ocr_interface cost time " + str(time.time()-start_time))
 
 
 def from_gpu_interface_redis(_dict, model_type, predictor_type):
@@ -366,6 +378,7 @@ def from_gpu_interface_redis(_dict, model_type, predictor_type):
 
 def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE):
     log("into from_otr_interface")
+    start_time = time.time()
     try:
         base64_stream = base64.b64encode(image_stream)
 
@@ -424,6 +437,8 @@ def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE)
         log("from_otr_interface error!")
         print("from_otr_interface", traceback.print_exc())
         return [-1]
+    finally:
+        log("from_otr_interface cost time " + str(time.time()-start_time))
 
 
 def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
@@ -487,7 +502,6 @@ def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
             image_np = cv2.imdecode(buffer, 1)
         else:
             image_np = _dict.get("image")
-        log("from_isr_interface cost time " + str(time.time()-start_time))
         return image_np
     except Exception as e:
         log("from_isr_interface error!")
@@ -495,7 +509,7 @@ def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
         return [-11]
     finally:
         # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
-        pass
+        log("from_isr_interface cost time " + str(time.time()-start_time))
 
 
 def from_idc_interface(image_stream, from_remote=FROM_REMOTE):
@@ -543,12 +557,13 @@ def from_idc_interface(image_stream, from_remote=FROM_REMOTE):
 
         _dict = r
         angle = _dict.get("angle")
-        log("from_idc_interface cost time " + str(time.time()-start_time))
         return angle
     except Exception as e:
         log("from_idc_interface error!")
         traceback.print_exc()
         return [-11]
+    finally:
+        log("from_idc_interface cost time " + str(time.time()-start_time))
 
 
 def from_atc_interface(text, from_remote=FROM_REMOTE):
@@ -594,12 +609,13 @@ def from_atc_interface(text, from_remote=FROM_REMOTE):
 
         _dict = r
         classification = _dict.get("classification")
-        log("from_atc_interface cost time " + str(time.time()-start_time))
         return classification
     except Exception as e:
         log("from_atc_interface error!")
         traceback.print_exc()
         return [-11]
+    finally:
+        log("from_atc_interface cost time " + str(time.time()-start_time))
 
 
 def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
@@ -652,12 +668,13 @@ def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
 
         _dict = r
         b_table_list = _dict.get("b_table_list")
-        log("from_yolo_interface cost time " + str(time.time()-start_time))
         return b_table_list
     except Exception as e:
         log("from_yolo_interface error!")
         traceback.print_exc()
         return [-11]
+    finally:
+        log("from_yolo_interface cost time " + str(time.time()-start_time))
 
 
 def interface_pool_gunicorn(interface_type):

+ 75 - 0
format_convert/convert_ofd.py

@@ -0,0 +1,75 @@
+import base64
+import os
+import re
+import sys
+import time
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
+from format_convert.easyofd.easyofd.ofd import OFD
+from format_convert.convert_tree import _Document, _Sentence, _Page
+import logging
+import traceback
+from format_convert.convert_pdf import PDFConvert
+from format_convert.utils import judge_error_code, get_logger, log
+
+
+class OfdConvert:
+    """Convert an OFD document by first rendering it to PDF, then delegating to PDFConvert."""
+
+    def __init__(self, path, unique_type_dir):
+        # path: source .ofd file; unique_type_dir: working directory for intermediates
+        self._doc = _Document(path)
+        self.path = path
+        self.unique_type_dir = unique_type_dir
+        self.ofd = OFD()  # initialize the OFD helper class
+
+    def convert(self):
+        """Read the OFD as base64, render it to a PDF on disk and set up a PDFConvert."""
+        start_time = time.time()
+        file_prefix = os.path.splitext(os.path.split(self.path)[1])[0]
+
+        with open(self.path, "rb") as f:
+            ofd_b64 = str(base64.b64encode(f.read()), "utf-8")
+
+        self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_prefix}_xml",
+                      save_dir=self.unique_type_dir)  # parse the OFD base64 payload
+        # print("ofd.data", ofd.data) # ofd.data holds the parsed result
+        # page_need_to_image_dict marks pages that must be OCR'd as whole images
+        pdf_bytes, page_need_to_image_dict = self.ofd.to_pdf(return_need_convert_as_image=True)  # convert to pdf
+        log('ofd to pdf cost: ' + str(time.time()-start_time))
+        # print('page_need_to_image_dict', page_need_to_image_dict)
+
+        self.ofd.del_data()
+
+        # derive the output .pdf path from the input filename (strip ".ofd")
+        file_name = re.split('[/\\\]', self.path)[-1]
+        new_path = self.unique_type_dir + file_name[:-4] + '.pdf'
+
+        with open(new_path, "wb") as f:
+            f.write(pdf_bytes)
+        # NOTE(review): "odf" in this log message looks like a typo for "ofd"
+        log('odf to pdf path ' + new_path + ' cost: ' + str(time.time()-start_time))
+
+        # extract content via the PDF pipeline
+        self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None,
+                               page_need_to_image_dict=page_need_to_image_dict)
+        # self._pdf.convert()
+        # self._doc = self._pdf._doc
+
+    def get_html(self):
+        """Run convert() and return the PDF-derived HTML, or an error-code list on failure."""
+        try:
+            self.convert()
+        except:
+            traceback.print_exc()
+            self._doc.error_code = [-1]
+
+        # return the html produced by the pdf pipeline directly
+        if self._doc.error_code is not None:
+            return self._doc.error_code
+        else:
+            return self._pdf.get_html()
+
+
+if __name__ == '__main__':
+    # Manual smoke test with developer-local paths (Windows machine).
+    _p = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
+    # NOTE(review): `p` is assigned but never used; `_p` is what is converted
+    p = '../1750060386706.ofd'
+    # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp\2" + '/'
+    c = OfdConvert(_p, save_dir)
+    _html = c.get_html()
+    # write the first page of HTML output for visual inspection
+    with open('../result.html', 'w', encoding='utf-8') as f:
+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])
+
+

+ 75 - 0
format_convert/convert_ofd_test.py

@@ -0,0 +1,75 @@
+import base64
+import os
+import re
+import sys
+import time
+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
+
+from format_convert.utils import judge_error_code, get_logger, log, register_all_fonts
+# register_all_fonts("/usr/share/fonts/")
+
+from format_convert.easyofd.easyofd.ofd import OFD
+from format_convert.convert_tree import _Document, _Sentence, _Page
+import logging
+import traceback
+from format_convert.convert_pdf import PDFConvert
+
+
+class OfdConvert:
+    """Test-only variant of OfdConvert (convert_ofd_test.py): OFD -> PDF -> PDFConvert.
+
+    NOTE(review): near-duplicate of format_convert/convert_ofd.py, minus the
+    page_need_to_image_dict handling — keep the two in sync or merge them.
+    """
+
+    def __init__(self, path, unique_type_dir):
+        self._doc = _Document(path)
+        self.path = path
+        self.unique_type_dir = unique_type_dir
+        self.ofd = OFD()  # initialize the OFD helper class
+
+    def convert(self):
+        """Read the OFD as base64, render it to a PDF on disk and set up a PDFConvert."""
+        start_time = time.time()
+        file_prefix = os.path.splitext(os.path.split(self.path)[1])[0]
+
+        with open(self.path, "rb") as f:
+            ofd_b64 = str(base64.b64encode(f.read()), "utf-8")
+
+        self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_prefix}_xml",
+                      save_dir=self.unique_type_dir)  # parse the OFD base64 payload
+        # print("ofd.data", ofd.data) # ofd.data holds the parsed result
+        pdf_bytes = self.ofd.to_pdf()  # convert to pdf
+
+        self.ofd.del_data()
+
+        # derive the output .pdf path from the input filename (strip ".ofd")
+        file_name = re.split('[/\\\]', self.path)[-1]
+        new_path = self.unique_type_dir + file_name[:-4] + '.pdf'
+
+        with open(new_path, "wb") as f:
+            f.write(pdf_bytes)
+        # NOTE(review): "odf" in this log message looks like a typo for "ofd"
+        log('odf to pdf path ' + new_path + ' cost: ' + str(time.time()-start_time))
+
+        # extract content via the PDF pipeline
+        self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None)
+        # _pdf.convert()
+        # self._doc = _pdf._doc
+
+    def get_html(self):
+        """Run convert() and return the PDF-derived HTML, or an error-code list on failure."""
+        try:
+            self.convert()
+        except:
+            traceback.print_exc()
+            self._doc.error_code = [-1]
+
+        # return the html produced by the pdf pipeline directly
+        if self._doc.error_code is not None:
+            return self._doc.error_code
+        else:
+            return self._pdf.get_html()
+
+
+if __name__ == '__main__':
+    # Manual smoke test; the second assignment to _p overrides the first.
+    _p = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
+    _p = '../1750381792388.ofd'
+    # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
+    save_dir = "/data/fangjiasheng/format_conversion_maxcompute/format_convert/temp" + '/'
+    c = OfdConvert(_p, save_dir)
+    _html = c.get_html()
+    print(_html)
+
+

+ 352 - 51
format_convert/convert_pdf.py

@@ -1,3 +1,6 @@
+import shutil
+import zlib
+from glob import glob
 import copy
 import io
 import os
@@ -23,10 +26,12 @@ from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
     LTTextBoxVertical, LTLine, LTTextContainer, LTTextLine
 from format_convert.utils import judge_error_code, get_platform, LineTable, log, \
-    memory_decorator, get_garble_code, get_md5_from_bytes, bytes2np, bbox_iou, get_garble_code2, get_traditional_chinese
+    memory_decorator, get_garble_code, get_md5_from_bytes, bytes2np, bbox_iou, get_garble_code2, \
+    get_traditional_chinese, ascii85_decode
 import fitz
 from format_convert.wrapt_timeout_decorator import timeout
 from otr.table_line_pdf import table_line_pdf
+from botr.extract_table import get_b_table_by_blank_colon
 
 
 @memory_decorator
@@ -38,6 +43,7 @@ def pdf2text(path, unique_type_dir):
 def pdf_analyze(interpreter, page, device, page_no):
     pdf_time = time.time()
     interpreter.process_page(page)
+    # print('interpreter.process_page time', time.time()-pdf_time)
     layout = device.get_result()
     log("page_no: " + str(page_no) + " pdf_analyze cost: " + str(time.time() - pdf_time))
     return layout
@@ -76,7 +82,7 @@ def read_pdfplumber(path, laparams):
 
 
 class PDFConvert:
-    def __init__(self, path, unique_type_dir, need_page_no):
+    def __init__(self, path, unique_type_dir, need_page_no, page_need_to_image_dict=None):
         self._doc = _Document(path)
         self.path = path
         self.unique_type_dir = unique_type_dir
@@ -89,7 +95,7 @@ class PDFConvert:
         self.end_page_no = None
         # 默认使用limit_page_cnt控制,前10页后10页
         if self.need_page_no is None:
-            self.limit_page_cnt = 20
+            self.limit_page_cnt = 50
         else:
             # 使用start_page_no,end_page_no范围控制,例如2,5
             ss = self.need_page_no.split(',')
@@ -120,6 +126,12 @@ class PDFConvert:
         # 初始化_page
         self._page = _Page(None, 0)
 
+        # 需要直接转成image来识别的页面
+        if type(page_need_to_image_dict) is not dict:
+            self.page_need_to_image_dict = {}
+        else:
+            self.page_need_to_image_dict = page_need_to_image_dict
+
     @memory_decorator
     def init_package(self, package_name):
         # 各个包初始化
@@ -128,7 +140,9 @@ class PDFConvert:
                                 char_margin=0.3,
                                 line_margin=0.01,
                                 word_margin=0.01,
-                                boxes_flow=0.1, )
+                                # boxes_flow=0.1,
+                                boxes_flow=None,
+                                )
             if package_name == self.packages[0]:
                 self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
                 self.has_init_pdf[0] = 1
@@ -153,7 +167,7 @@ class PDFConvert:
             self._doc.error_code = [-3]
 
     @memory_decorator
-    def convert(self, limit_page_cnt=20):
+    def convert(self, limit_page_cnt=50):
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
         if self._doc.error_code is not None:
@@ -201,8 +215,11 @@ class PDFConvert:
                     continue
             # 限制pdf页数,只取前后各10页
             else:
-                if page_count > limit_page_cnt and int(limit_page_cnt / 2) <= page_no < page_count - int(
-                        limit_page_cnt / 2):
+                # if page_count > limit_page_cnt and int(limit_page_cnt / 2) <= page_no < page_count - int(
+                #         limit_page_cnt / 2):
+                #     page_no += 1
+                #     continue
+                if page_count > limit_page_cnt and page_no >= limit_page_cnt:
                     page_no += 1
                     continue
 
@@ -222,6 +239,8 @@ class PDFConvert:
         delete_water_mark_list = []
 
         for layout, layout_obj_list, max_y, page_no in layout_list:
+            # for obj in layout_obj_list:
+            #     print('obj', obj)
             # 解析单页
             start_time = time.time()
             self._page = _Page(None, page_no)
@@ -251,7 +270,10 @@ class PDFConvert:
                 find_flag = 0
                 add_page_list = []
                 for page in pages:
-                    if not int(limit_page_cnt / 2) <= page_no < page_count - int(limit_page_cnt / 2):
+                    # if not int(limit_page_cnt / 2) <= page_no < page_count - int(limit_page_cnt / 2):
+                    #     page_no += 1
+                    #     continue
+                    if not (page_no >= limit_page_cnt):
                         page_no += 1
                         continue
 
@@ -297,9 +319,11 @@ class PDFConvert:
                     page_no += 1
 
                 if add_page_list:
-                    self._doc.children = self._doc.children[
-                                         :int(limit_page_cnt / 2)] + add_page_list + self._doc.children[
-                                                                                     int(limit_page_cnt / 2):]
+                    # self._doc.children = self._doc.children[:int(limit_page_cnt / 2)] \
+                    #                      + add_page_list \
+                    #                      + self._doc.children[int(limit_page_cnt / 2):]
+                    self._doc.children = self._doc.children[:limit_page_cnt] \
+                                         + add_page_list
 
         self.delete_same_image()
         # self.delete_bold_text_duplicate()
@@ -375,10 +399,14 @@ class PDFConvert:
 
         return pages, delete_footer_header_list
 
+    @memory_decorator
     def delete_bold_text_duplicate(self, lt_text_box_list):
         # 拿出所有LTChar
         lt_char_list = []
         for lt_text_box in lt_text_box_list:
+            if '.......' in lt_text_box.get_text():
+                # print('....... lt_text_box continue')
+                continue
             for lt_text_line in lt_text_box:
                 for lt_char in lt_text_line:
                     if isinstance(lt_char, LTChar):
@@ -447,14 +475,16 @@ class PDFConvert:
     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
         list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
                                                                                     from_pdf=True, is_reverse=False)
-        self._page.in_table_objs = filter_objs
+        # self._page.in_table_objs = filter_objs
 
         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
 
+        table_list = []
         for table in list_tables:
             _table = _Table(table["table"], table["bbox"])
             # self._page.children.append(_table)
             self._page.add_child(_table)
+            table_list.append(_table)
 
         list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
                                                         layout.bbox, page_no)
@@ -466,7 +496,7 @@ class PDFConvert:
         # pdf对象需反向排序
         # self._page.is_reverse = True
 
-        return list_tables
+        return table_list
 
     def is_text_legal(self, lt_text_list, page_no):
         # 无法识别pdf字符编码,整页用ocr
@@ -498,10 +528,11 @@ class PDFConvert:
 
         return True
 
+    @memory_decorator
     def judge_b_table(self, lt_text_list, table_list, page_no):
         table_h_list = []
         for table in table_list:
-            table_h_list.append([table.get('bbox')[1], table.get('bbox')[3]])
+            table_h_list.append([table.bbox[1], table.bbox[3]])
 
         # 先分行
         lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
@@ -528,6 +559,8 @@ class PDFConvert:
         row_cnt = 0
         b_table_row_list = []
         all_b_table = []
+        row_col_list = []
+        all_row_col_list = []
         for row in lt_text_row_list:
             # 水印行跳过
             if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
@@ -537,6 +570,7 @@ class PDFConvert:
             for r in row:
                 if re.search('[.·]{7,}', r.get_text()):
                     continue_flag = True
+                    all_row_col_list = []
                     break
             if continue_flag:
                 continue
@@ -550,6 +584,7 @@ class PDFConvert:
                     row_cnt += 1
                     t_cnt = 0
                     b_table_row_list += row
+                    row_col_list += [row]
                 else:
                     # 容忍
                     if t_cnt < tolerate_cnt:
@@ -557,15 +592,36 @@ class PDFConvert:
                         continue
                     if b_table_row_list and row_cnt >= is_b_table_cnt:
                         all_b_table.append(b_table_row_list)
+                        all_row_col_list.append(row_col_list)
                     row_cnt = 0
                     b_table_row_list = []
+                    row_col_list = []
             else:
                 row_cnt += 1
                 t_cnt = 0
                 b_table_row_list += row
+                row_col_list += [row]
 
         if b_table_row_list and row_cnt >= is_b_table_cnt:
             all_b_table.append(b_table_row_list)
+            all_row_col_list.append(row_col_list)
+            # print('b_table_row_list', b_table_row_list)
+
+        # 排除大部分是两列的,因为前面已经新增了两列无边框的单独识别
+        # print('len(all_row_col_list)', len(all_row_col_list))
+        row_cnt = 0
+        col_2_cnt = 0
+        for row_col_list in all_row_col_list:
+            for col_list in row_col_list:
+                row_cnt += 1
+                if len(col_list) == 2:
+                    col_2_cnt += 1
+                # print('col_list', col_list)
+
+        # print('row_cnt, col_2_cnt', row_cnt, col_2_cnt)
+        if row_cnt == 0 or col_2_cnt / row_cnt >= 0.5:
+            log("page_no: " + str(page_no) + ' is_b_table_flag False')
+            return False
 
         # 对每个可能的b_table判断是否与table相交
         is_b_table_flag = False
@@ -587,8 +643,35 @@ class PDFConvert:
                 # print('table_h_list', table_h_list)
                 break
         log("page_no: " + str(page_no) + ' is_b_table_flag ' + str(is_b_table_flag))
+        # 保存判断为True的pdf
+        # if is_b_table_flag:
+        #     self.save_b_table_pdf(page_no)
         return is_b_table_flag
 
+    def save_b_table_pdf(self, page_no):
+        """Debug helper: copy the current PDF into a sample directory for later analysis.
+
+        Samples at most `max_index` files; silently does nothing when the
+        directory is missing or the quota is exhausted. Files are named
+        "{index}-{page_no}.pdf".
+        """
+        # save_dir = r"D:\Project\format_conversion_maxcompute\save_b_table_pdf"
+        save_dir = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_pdf'
+        max_index = 200
+        if os.path.exists(save_dir):
+            file_list = glob(save_dir + '/*')
+            if file_list:
+                # NOTE(review): [-3] assumes every existing file matches the
+                # "{index}-{page_no}.pdf" naming — an unrelated file in save_dir
+                # would raise here; confirm the directory is only written by this method
+                file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
+                file_index_list.sort(key=lambda x: x)
+                index = file_index_list[-1] + 1
+            else:
+                index = 0
+            if index > max_index:
+                return
+        else:
+            # directory must pre-exist; sampling is opt-in per machine
+            return
+
+        save_path = f'{save_dir}/{index}-{page_no}.pdf'
+        try:
+            shutil.copy(self.path, save_path)
+            print("文件复制成功!")
+        except Exception as e:
+            print(f"文件复制失败:{e}")
+
     def char_to_text_box(self, char_list):
         lt_text_box_list = []
 
@@ -646,6 +729,7 @@ class PDFConvert:
 
         return lt_text_box_list, text_box_char_dict
 
+    @memory_decorator
     def get_need_objs(self, obj_list, max_y):
         # 文字
         lt_char_list = []
@@ -695,6 +779,14 @@ class PDFConvert:
             elif isinstance(x, (LTTextContainer, LTRect, LTLine, LTCurve)):
                 lt_line_list.append(x)
 
+        # print('len(obj_list)', len(obj_list))
+        # print('len(lt_char_list)', len(lt_char_list))
+        # print('len(lt_text_box_list)', len(lt_text_box_list))
+        # if len(lt_text_box_list) >= 200:
+        #     for lt_text in lt_text_box_list:
+        #         print('>= 200 lt_text', lt_text.get_text())
+        # print('len(lt_image_list)', len(lt_image_list))
+
         if lt_figure_list:
             temp_figure_list = []
             for sub_figure in lt_figure_list:
@@ -719,8 +811,21 @@ class PDFConvert:
 
         text_box_char_dict = {**text_box_char_dict, **add_text_box_char_dict}
 
+        lt_text_box_list = self.delete_water_mark_by_location(lt_text_box_list)
+
+        # 分行后过滤
+        temp_list = []
+        for lt_text_box in lt_text_box_list:
+            if lt_text_box.get_text() in ['', ' ', '\t', '\n', '\r']:
+                continue
+            temp_list.append(lt_text_box)
+        if len(lt_text_box_list) != len(temp_list):
+            log('filter lt_text_box_list ' + str(len(lt_text_box_list)) + ' -> ' + str(len(temp_list)))
+        lt_text_box_list = temp_list
+
         return lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, lt_line_list, text_box_char_dict
 
+    @memory_decorator
     def read_layout(self, page, page_no):
         layout = self.get_layout(page, page_no)
         if self._doc.error_code is not None:
@@ -834,6 +939,7 @@ class PDFConvert:
 
         return lt_text_box_list
 
+    @memory_decorator
     def split_text_box_by_lines2(self, lt_line_list, lt_text_box_list, text_box_char_dict):
         """
         有单个字符位置信息,再根据表格线截断位置,分割text
@@ -932,12 +1038,23 @@ class PDFConvert:
         return lt_text_box_list
 
     @memory_decorator
-    # def convert_page(self, page, page_no, skip_image=0):
     def convert_page(self, layout, layout_obj_list, max_y, page_no, delete_water_mark_list, skip_image=0):
         # 若Page中一个obj都无,后面ocr整页识别 20240820
         if max_y == 0 and len(layout_obj_list) > 0:
             return
 
+        # 若该页在page_need_to_image_dict中为True,则直接ocr整页识别
+        if self.page_need_to_image_dict.get(page_no) is True:
+            page_image = self.get_page_image(page_no)
+            if judge_error_code(page_image):
+                self._page.error_code = page_image
+            else:
+                _image = _Image(page_image[1], page_image[0])
+                _image.is_from_pdf = True
+                _image.is_reverse = False
+                self._page.add_child(_image)
+            return
+
         lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, \
             lt_line_list, text_box_char_dict = layout_obj_list
 
@@ -999,45 +1116,56 @@ class PDFConvert:
         # 正常读取该页对象
         else:
             # 图表对象
-            for image in lt_image_list:
-                try:
-                    # print("pdf2text LTImage size", page_no, image.width, image.height)
-                    image_stream = image.stream.get_data()
-                    # 小的图忽略
-                    if image.width <= 300 and image.height <= 300:
-                        continue
-                    # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
-                    img_test = Image.open(io.BytesIO(image_stream))
-                    if image.height >= 1000 and image.width >= 1000:
-                        page_image = self.get_page_image(page_no)
-                        if judge_error_code(page_image):
-                            self._page.error_code = page_image
-                        else:
-                            _image = _Image(page_image[1], page_image[0])
-                            _image.is_from_pdf = True
-                            _image.is_reverse = False
-                            self._page.add_child(_image)
-                            image_md5 = get_md5_from_bytes(page_image[1])
-                            self.md5_image_obj_list.append([image_md5, _image])
-                        return
-                    # 比较小的图则直接保存用ocr识别
-                    else:
-                        temp_path = self.unique_type_dir + 'page' + str(page_no) \
-                                    + '_lt' + str(lt_image_list.index(image)) + '.jpg'
-                        img_test.save(temp_path)
-                        with open(temp_path, "rb") as ff:
-                            image_stream = ff.read()
-                        _image = _Image(image_stream, temp_path, image.bbox)
-                        self._page.add_child(_image)
-                        image_md5 = get_md5_from_bytes(image_stream)
-                        self.md5_image_obj_list.append([image_md5, _image])
-                except Exception:
-                    log("page_no: " + str(page_no) + " pdfminer read image fail! use pymupdf read image...")
-                    traceback.print_exc()
+            # for image in lt_image_list:
+            #     try:
+            #         # print("pdf2text LTImage size", page_no, image.width, image.height)
+            #         # image_stream = image.stream.get_data()
+            #         print('image.stream.get_filters()', image.stream.get_filters())
+            #         image_stream = image.stream.get_data()
+            #         # 小的图忽略
+            #         if image.width <= 300 and image.height <= 300:
+            #             continue
+            #         # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
+            #         img_test = Image.open(io.BytesIO(image_stream))
+            #         # img_test = self.pdfminer_stream_to_image(image)
+            #         if image.height >= 1000 and image.width >= 1000:
+            #             page_image = self.get_page_image(page_no)
+            #             if judge_error_code(page_image):
+            #                 self._page.error_code = page_image
+            #             else:
+            #                 _image = _Image(page_image[1], page_image[0])
+            #                 _image.is_from_pdf = True
+            #                 _image.is_reverse = False
+            #                 self._page.add_child(_image)
+            #                 image_md5 = get_md5_from_bytes(page_image[1])
+            #                 self.md5_image_obj_list.append([image_md5, _image])
+            #             return
+            #         # 比较小的图则直接保存用ocr识别
+            #         else:
+            #             temp_path = self.unique_type_dir + 'page' + str(page_no) \
+            #                         + '_lt' + str(lt_image_list.index(image)) + '.jpg'
+            #             img_test.save(temp_path)
+            #             with open(temp_path, "rb") as ff:
+            #                 image_stream = ff.read()
+            #             _image = _Image(image_stream, temp_path, image.bbox)
+            #             self._page.add_child(_image)
+            #             image_md5 = get_md5_from_bytes(image_stream)
+            #             self.md5_image_obj_list.append([image_md5, _image])
+            #     except Exception:
+            #         log("page_no: " + str(page_no) + " pdfminer read image fail! use pymupdf read image...")
+            #         traceback.print_exc()
 
             # pdf对象需反向排序
             # self._page.is_reverse = True
 
+            status = self.pdfminer_read_page_images(lt_image_list, page_no)
+            if not status:
+                log('pymupdf 提取页面中图片 page_no: ' + str(page_no))
+                status = self.pymupdf_read_page_images(page_no)
+            if not status:
+                log('pymupdf 整页转化为图片 page_no: ' + str(page_no))
+                status = self.pymupdf_get_whole_page_image(page_no)
+
             if self.has_init_pdf[3] == 0:
                 self.init_package("pdfplumber")
 
@@ -1059,7 +1187,24 @@ class PDFConvert:
             table_list = self.recognize_text(layout, page_no, lt_text_box_list, lt_line_list)
 
             # 根据text规律,判断该页是否可能有无边框表格
+            try:
+                b_table_list, _ = get_b_table_by_blank_colon(lt_text_box_list, table_list, layout.bbox, None)
+            except:
+                traceback.print_exc()
+                b_table_list = []
+                self._page.error_code = [-23]
+
+            if b_table_list:
+                for table in b_table_list:
+                    _table = _Table(table[0], table[1])
+                    table_list += [_table]
+                    self._page.add_child(_table)
+
+            for t in table_list:
+                self._page.table_bbox_list.append(t.bbox)
+
             if self.judge_b_table(lt_text_box_list, table_list, page_no):
+                # log('judge_b_table match! ' + str(page_no))
                 page_image = self.get_page_image(page_no)
                 if judge_error_code(page_image):
                     self._page.error_code = page_image
@@ -1073,6 +1218,7 @@ class PDFConvert:
                     _image.b_table_layout_size = (layout.width, layout.height)
                     self._page.add_child(_image)
 
+    @memory_decorator
     def get_layout(self, page, page_no):
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
@@ -1096,6 +1242,7 @@ class PDFConvert:
         log("page_no: " + str(page_no) + " get_layout cost: " + str(time.time() - start_time))
         return layout
 
+    @memory_decorator
     def get_page_image(self, page_no):
         start_time = time.time()
         try:
@@ -1503,6 +1650,7 @@ class PDFConvert:
             return [-12]
         return html
 
+    @memory_decorator
     def delete_water_mark(self, lt_text_list, page_bbox, times=5):
         # 删除过多重复字句,为水印
         duplicate_dict = {}
@@ -1540,6 +1688,32 @@ class PDFConvert:
                 temp_text_list.append(_obj)
         return temp_text_list, delete_text
 
+    @memory_decorator
+    def delete_water_mark_by_location(self, lt_text_box_list):
+        x_text_box_dict = {}
+        # 水印,x坐标相同,且长度为1
+        for lt_text_box in lt_text_box_list:
+            x1, y1, x2, y2 = lt_text_box.bbox
+            text = lt_text_box.get_text()
+            if len(text) != 1:
+                continue
+            key = f'{x1}-{x2}-{text}'
+            if key in x_text_box_dict:
+                x_text_box_dict[key] += [lt_text_box]
+            else:
+                x_text_box_dict[key] = [lt_text_box]
+
+        len1 = len(lt_text_box_list)
+        for key, box_list in x_text_box_dict.items():
+            if len(box_list) >= 3:
+                for box in box_list:
+                    if box in lt_text_box_list:
+                        lt_text_box_list.remove(box)
+        len2 = len(lt_text_box_list)
+        if len1 != len2:
+            log('delete_water_mark_by_location box num ' + str(len1) + ' -> ' + str(len2))
+        return lt_text_box_list
+
     def delete_water_mark_by_color(self, lt_text_list):
         # 删除浅色字体,大概率为水印
         # 1. 单个char颜色透明度0.8以上
@@ -1587,6 +1761,9 @@ class PDFConvert:
         water_mark_text_box_list = []
         sin_range = [0.3, 0.94]
         for lt_text_box in lt_text_list:
+            if '.......' in lt_text_box.get_text():
+                # print('....... lt_text_box continue')
+                continue
             for lt_text_line in lt_text_box:
                 for lt_char in lt_text_line:
                     matrix = lt_char.matrix
@@ -1634,6 +1811,126 @@ class PDFConvert:
             log("page_no: " + str(page_no) + " get_single_pdf error!")
             return [-3]
 
+    def pymupdf_read_page_images(self, page_no):
+        try:
+            self.init_package("PyMuPDF")
+            # 获取指定页面
+            page = self.doc_pymupdf.load_page(page_no)
+            # 获取页面中所有图片的信息
+            image_list = page.get_images(full=True)
+
+            # 存储提取的图片信息
+            extracted_images = []
+
+            # 遍历图片列表
+            for img_index, img_info in enumerate(image_list):
+                xref = img_info[0]  # 图片xref编号
+                base_image = self.doc_pymupdf.extract_image(xref)
+                image_bytes = base_image["image"]  # 图片字节数据
+                image_ext = base_image["ext"]  # 图片扩展名
+
+                # 获取图片在页面中的位置和大小
+                bbox = img_info[0:4]  # x0, y0, x1, y1
+                # print('img_info', img_info)
+                width = img_info[2] - img_info[0]  # 计算宽度
+                height = img_info[3] - img_info[1]  # 计算高度
+
+                # 构建图片信息字典
+                img_data = {
+                    "xref": xref,
+                    "width": width,
+                    "height": height,
+                    "image": image_bytes,
+                    "ext": image_ext,
+                    "bbox": bbox
+                }
+                extracted_images.append(img_data)
+
+            image_obj_list = []
+            for index, d in enumerate(extracted_images):
+                temp_path = self.unique_type_dir + 'page' + str(page_no) \
+                            + '_lt2_' + str(index) + '.jpg'
+                image_bytes = d.get("image")
+                bbox = d.get('bbox')
+                with open(temp_path, 'wb') as f:
+                    f.write(image_bytes)
+
+                _image = _Image(image_bytes, temp_path, bbox)
+                image_md5 = get_md5_from_bytes(image_bytes)
+                image_obj_list.append([_image, image_md5])
+        except:
+            traceback.print_exc()
+            return False
+
+        for _image, image_md5 in image_obj_list:
+            self._page.add_child(_image)
+            self.md5_image_obj_list.append([image_md5, _image])
+        return True
+
+    def pymupdf_get_whole_page_image(self, page_no):
+        image_obj_list = []
+        page_image = self.get_page_image(page_no)
+        if judge_error_code(page_image):
+            self._page.error_code = page_image
+            return False
+        else:
+            _image = _Image(page_image[1], page_image[0])
+            _image.is_from_pdf = True
+            _image.is_reverse = False
+            image_md5 = get_md5_from_bytes(page_image[1])
+            image_obj_list.append([_image, image_md5])
+
+        for _image, image_md5 in image_obj_list:
+            self._page.add_child(_image)
+            self.md5_image_obj_list.append([image_md5, _image])
+        return True
+
+    def pdfminer_read_page_images(self, lt_image_list, page_no):
+        # 图表对象
+        image_obj_list = []
+        for image in lt_image_list:
+            try:
+                # print("pdf2text LTImage size", page_no, image.width, image.height)
+                # image_stream = image.stream.get_data()
+                # print('image.stream.get_filters()', image.stream.get_filters())
+                image_stream = image.stream.get_data()
+                # 小的图忽略
+                if image.width <= 300 and image.height <= 300:
+                    continue
+                # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
+                img_test = Image.open(io.BytesIO(image_stream))
+                # img_test = self.pdfminer_stream_to_image(image)
+                # if image.height >= 1000 and image.width >= 1000:
+                #     page_image = self.get_page_image(page_no)
+                #     if judge_error_code(page_image):
+                #         self._page.error_code = page_image
+                #     else:
+                #         _image = _Image(page_image[1], page_image[0])
+                #         _image.is_from_pdf = True
+                #         _image.is_reverse = False
+                #         image_md5 = get_md5_from_bytes(page_image[1])
+                #         image_obj_list.append([_image, image_md5])
+                # # 比较小的图则直接保存用ocr识别
+                # else:
+                temp_path = self.unique_type_dir + 'page' + str(page_no) \
+                            + '_lt_' + str(lt_image_list.index(image)) + '.jpg'
+                img_test.save(temp_path)
+                with open(temp_path, "rb") as ff:
+                    image_stream = ff.read()
+                _image = _Image(image_stream, temp_path, image.bbox)
+                self._page.add_child(_image)
+                image_md5 = get_md5_from_bytes(image_stream)
+                self.md5_image_obj_list.append([image_md5, _image])
+            except Exception:
+                log("page_no: " + str(page_no) + " pdfminer read image fail!")
+                traceback.print_exc()
+                return False
+
+        for _image, image_md5 in image_obj_list:
+            self._page.add_child(_image)
+            self.md5_image_obj_list.append([image_md5, _image])
+        return True
+
 
 def get_text_font():
     def flags_decomposer(flags):
@@ -1999,4 +2296,8 @@ class ParseUtils:
 
 
 if __name__ == '__main__':
-    PDFConvert(r"C:/Users/Administrator/Downloads/1651896704621.pdf", "C:/Users/Administrator/Downloads/1").get_html()
+    _pp = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf/e-116.pdf'
+    # _pp = r'C:\Users\Administrator\Downloads\1746582280828.pdf'
+    _html = PDFConvert(_pp, r"D:\Project\format_conversion_maxcompute\format_convert\temp", None).get_html()
+    with open('../result.html', 'w', encoding='utf-8') as f:
+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])

+ 30 - 36
format_convert/convert_test.py

@@ -11,15 +11,6 @@ from glob import glob
 import requests
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTLine
-from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfparser import PDFParser
-from pdfplumber import PDF
-
-from otr.table_line_pdf import _plot
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
@@ -44,7 +35,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
             'timeout': timeout, 'save_middle': save_middle}
 
-    # _url = 'http://121.46.18.113:15010/convert'
+    # _url = 'http://dianxin.bidizhaobiao.com:15010/convert'
     # _url = 'http://192.168.2.103:15010/convert'
     # _url = 'http://192.168.2.102:15010/convert'
     # _url = 'http://172.16.160.65:15010/convert'
@@ -53,7 +44,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
     text_str = ""
     try:
         result = json.loads(request_post(_url, data, time_out=timeout+20))
-
+        print('result', result)
         for t in result.get("result_html"):
             text_str += t
         to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
@@ -67,7 +58,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
                 to_html(new_path, text_str)
 
         print(_md5)
-        print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
+        # print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
         print("result_text", result.get("result_text")[0][:20])
         print("is_success", result.get("is_success"))
     except:
@@ -80,7 +71,6 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
     return p, 1
 
 
-
 def test_path():
     # _url = 'http://121.46.18.113:15010/convert'
     _url = 'http://192.168.0.115:15010/convert'
@@ -186,21 +176,25 @@ def test_kimi():
 
 if __name__ == '__main__':
     if get_platform() == "Windows":
-        # file_path = "C:/Users/Administrator/Downloads/1672314827836.pdf"
+        # file_path = "C:/Users/Administrator/Downloads/1750737587843.ofd"
+        # file_path = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf/e-1.pdf'
         # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
 
-        # file_path = "C:/Users/Administrator/Desktop/test_xls/error7.xls"
-        # file_path = "C:/Users/Administrator/Desktop/test_doc/error15.doc"
-        # file_path = "C:/Users/Administrator/Desktop/test_swf/error1.swf"
+        # file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xlsx"
+        # file_path = "C:/Users/Administrator/Desktop/test_doc/error17.docx"
+        # file_path = "C:/Users/Administrator/Desktop/test_swf/error2.swf"
         # file_path = "C:/Users/Administrator/Desktop/test_rar/error1.rar"
-        file_path = "C:/Users/Administrator/Desktop/test_image/error7.png"
-        # file_path = "C:/Users/Administrator/Desktop/test_b_table/error13.pdf"
-        # file_path = "C:/Users/Administrator/Desktop/test_pdf/表格连接error/error6.pdf"
+        # file_path = "C:/Users/Administrator/Desktop/test_image/error18.png"
+        # file_path = "C:/Users/Administrator/Desktop/test_b_table/error29.png"
+        # file_path = "C:/Users/Administrator/Desktop/test_pdf/普通error/error6.pdf"
         # file_path = "C:/Users/Administrator/Desktop/test_table_head/error2.pdf"
+        # file_path = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
+        file_path = "C:/Users/Administrator/Desktop/test_ofd/1750381792388.ofd"
     else:
         file_path = "1660296734009.pdf"
 
-    test_one(file_path, page_no_range='1,-1', timeout=1000, save_middle=None)
+    # test_one(file_path, page_no_range="1,-1", timeout=1000, save_middle=None)
+    test_one(file_path, page_no_range=None, timeout=1000, save_middle=None)
 
     # run_files()
 
@@ -212,21 +206,21 @@ if __name__ == '__main__':
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
     # file_path = r"C:\Users\Administrator\Desktop\test_b_table/"
-    file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
-    test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
-                     ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
-                     ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
-                     ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
-                     ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
-                     ['error50.pdf', '1,-1'],
-                     ['error59.pdf', '1,-1'],
-                     ['error60.pdf', '1,-1'],
-                     ['error61.pdf', '1,-1'],
-                     ['error7.pdf', '39,57'],
-                     ['error8.pdf', '7,12'],
-                     ['error23.pdf', '1,-1']
-                     ]
-    index = 11
+    # file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
+    # test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
+    #                  ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
+    #                  ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
+    #                  ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
+    #                  ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
+    #                  ['error50.pdf', '1,-1'],
+    #                  ['error59.pdf', '1,-1'],
+    #                  ['error60.pdf', '1,-1'],
+    #                  ['error61.pdf', '1,-1'],
+    #                  ['error7.pdf', '39,57'],
+    #                  ['error8.pdf', '7,12'],
+    #                  ['error23.pdf', '1,-1']
+    #                  ]
+    # index = 11
     # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
 
 

+ 91 - 4
format_convert/convert_tree.py

@@ -61,6 +61,8 @@ class _Page:
         self.in_table_objs = set()
         # 是否pdf
         self.is_pdf = 0
+        # 所有表格范围
+        self.table_bbox_list = []
 
     def add_child(self, child):
         if child.error_code is None:
@@ -74,12 +76,66 @@ class _Page:
 
         self.children = sort_object(self.children, self.is_reverse)
 
+        # 有图片类型,需返回图片中所有对象,并重新设置图片中的bbox,以及图片后的对象的bbox
+        image_add_y = 0
+        add_childern = []
+        for child in self.children:
+            if type(child) == _Image:
+                image_children = child.get_html(return_children=True)
+                if judge_error_code(image_children) and not self.is_pdf:
+                    self.error_code = image_children
+                    return self.error_code
+                if len(image_children) == 0:
+                    continue
+                image_children = sort_object(image_children, False)
+
+                # 单张图可能无bbox,但文档中的图有bbox
+                if child.bbox != (0, 0, 0, 0):
+                    for i_child in image_children:
+                        i_child.bbox = [i_child.bbox[0], i_child.bbox[1] + child.bbox[3] + image_add_y,
+                                        i_child.bbox[2], i_child.bbox[3] + child.bbox[3] + image_add_y
+                                        ]
+
+                image_add_y += image_children[-1].bbox[3]
+                add_childern += image_children
+                continue
+
+            # 图片对象后面的对象,bbox重新设置
+            child.bbox = [child.bbox[0], child.bbox[1] + image_add_y,
+                          child.bbox[2], child.bbox[3] + image_add_y
+                          ]
+            # self.children += child.get_html(return_children=True)
+
+        self.children += add_childern
+        self.children = sort_object(self.children, self.is_reverse)
+
+        # 获取所有table,计算bbox,排除在table中的sentence
+        for child in self.children:
+            if type(child) == _Table:
+                # table_bbox = get_table_bbox(child.content)
+                # print('table.content ', child.content)
+                # print('child.bbox', child.bbox)
+                self.table_bbox_list += [child.bbox]
+
         html_text = ""
         image_html = ""
         text_html = ""
         for child in self.children:
+            if type(child) == _Image:
+                continue
+            if type(child) == _Sentence:
+                continue_flag = 0
+                for table_bbox in self.table_bbox_list:
+                    # print('table_bbox', table_bbox)
+                    if table_bbox[1] - 3 <= child.bbox[1] <= child.bbox[3] <= table_bbox[3] + 3:
+                        continue_flag = 1
+                        break
+                if continue_flag:
+                    continue
+
             # 先调用get_html才能更新error_code
             child_html_text = child.get_html()
+            # print('sort child_html_text', child_html_text)
             if child.error_code is not None:
                 self.error_code = child.error_code
                 return ""
@@ -158,14 +214,16 @@ class _Image:
         else:
             self.error_code = child.error_code
 
-    def get_html(self):
+    def get_html(self, return_children=False):
         # 将Image转为Sentence,table
         self.convert()
         # if self.error_code == [-16]:
         #     self.error_code = None
         #     return "<div>#idc error#<div>"
         if self.error_code is not None:
-            return ""
+            return self.error_code
+        if return_children:
+            return self.children
 
         html_text = ""
         self.children = sort_object(self.children)
@@ -192,7 +250,9 @@ class _Image:
                                  self.b_table_layout_size, self.is_reverse)
         if judge_error_code(obj_list):
             # 20241101 注释 图片识别报错返回空
-            # self.error_code = obj_list
+            # 20250604 不是来源pdf的,返回错误码
+            if not self.is_from_pdf:
+                self.error_code = obj_list
             return
 
         if self.b_table_from_text:
@@ -213,9 +273,19 @@ class _Table:
         self.bbox = bbox
         self.x = bbox[0]
         self.y = bbox[1]
-        self.shape = (len(content), len(content[0]))
+        if len(content) and len(content[0]):
+            self.shape = (len(content), len(content[0]))
+        else:
+            self.shape = (0, 0)
         self.error_code = None
 
+    def get_table_bbox(self, table):
+        x1 = min([y.bbox[0] for x in table for y in x])
+        y1 = min([y.bbox[1] for x in table for y in x])
+        x2 = max([y.bbox[2] for x in table for y in x])
+        y2 = max([y.bbox[3] for x in table for y in x])
+        return [x1, y1, x2, y2]
+
     def get_html(self):
         if self.error_code is not None:
             return ""
@@ -227,6 +297,9 @@ class _Table:
             html_text = get_table_html(self.content)
             return html_text
 
+    def __repr__(self):
+        return '(%s@#@%s)' % (str('table'), '@'.join([str(x) for x in self.bbox]))
+
 
 class _Sentence:
     def __init__(self, content, bbox, is_html=False):
@@ -249,6 +322,9 @@ class _Sentence:
         else:
             return add_div(self.content)
 
+    def __repr__(self):
+        return '(%s@#@%s)' % (str(self.content), '@'.join([str(x) for x in self.bbox]))
+
 
 class TextBox:
     def __init__(self, bbox, text):
@@ -261,6 +337,17 @@ class TextBox:
     def __str__(self):
         return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
 
+    def __repr__(self):
+        return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
+
+    def __hash__(self):
+        return hash(self.__str__())
+
+    def __eq__(self, other):
+        if isinstance(other, TextBox):
+            return self.__str__() == other.__str__()
+        return False
+
 
 class TableLine:
     def __init__(self, bbox):

+ 61 - 0
format_convert/convert_wps.py

@@ -0,0 +1,61 @@
+import os
+import re
+import sys
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
+from format_convert.convert_tree import _Document, _Sentence, _Page
+import logging
+import traceback
+from format_convert.convert_doc import DocConvert
+from format_convert.utils import judge_error_code, get_logger, log
+
+
+class WpsConvert:
+    def __init__(self, path, unique_type_dir):
+        self._doc = _Document(path)
+        self.path = path
+        self.unique_type_dir = unique_type_dir
+
+    def convert(self):
+        # 改后缀,调用doc处理
+        print('self.path', self.path)
+        file_name = re.split('[/\\\]', self.path)[-1]
+        with open(self.path, 'rb') as file:
+            content = file.read()
+
+        new_file_name = file_name[:-4] + '.doc'
+        new_file_path = self.unique_type_dir + new_file_name
+        print('new_file_path', new_file_path)
+        with open(new_file_path, 'wb') as file:
+            file.write(content)
+
+        log('wps file ' + file_name + ' -> ' + new_file_name)
+
+        self._doc_convert = DocConvert(new_file_path, self.unique_type_dir)
+        self._doc_convert.convert()
+        self._doc = self._doc_convert._doc
+
+    def get_html(self):
+        try:
+            self.convert()
+        except:
+            traceback.print_exc()
+            self._doc.error_code = [-1]
+
+        # 直接返回doc处理的html
+        if self._doc.error_code is not None:
+            return self._doc.error_code
+        else:
+            return self._doc.get_html()
+
+
+if __name__ == '__main__':
+    _p = "C:/Users/Administrator/Downloads/1723004790329.wps"
+    # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
+    c = WpsConvert(_p, save_dir)
+    _html = c.get_html()
+    with open('../result.html', 'w', encoding='utf-8') as f:
+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])
+
+

+ 6 - 0
format_convert/easyofd/easyofd/__init__.py

@@ -0,0 +1,6 @@
+from .ofd import OFD
+__version__ = "0.5.1"
+__author__ = "renoyuan"
+__email__ = "renoyuan@foxmail.com"
+__description__ = "一个用于OFD文档处理的Python库"
+__all__ = ["OFD"]

+ 474 - 0
format_convert/easyofd/easyofd/chinese_characters.txt

@@ -0,0 +1,474 @@
+豈
+更
+車
+賈
+滑
+串
+句
+龜
+龜
+契
+金
+喇
+奈
+懶
+癩
+羅
+蘿
+螺
+裸
+邏
+樂
+洛
+烙
+珞
+落
+酪
+駱
+亂
+卵
+欄
+爛
+蘭
+鸞
+嵐
+濫
+藍
+襤
+拉
+臘
+蠟
+廊
+朗
+浪
+狼
+郎
+來
+冷
+勞
+擄
+櫓
+爐
+盧
+老
+蘆
+虜
+路
+露
+魯
+鷺
+碌
+祿
+綠
+菉
+錄
+鹿
+論
+壟
+弄
+籠
+聾
+牢
+磊
+賂
+雷
+壘
+屢
+樓
+淚
+漏
+累
+縷
+陋
+勒
+肋
+凜
+凌
+稜
+綾
+菱
+陵
+讀
+拏
+樂
+諾
+丹
+寧
+怒
+率
+異
+北
+磻
+便
+復
+不
+泌
+數
+索
+參
+塞
+省
+葉
+說
+殺
+辰
+沈
+拾
+若
+掠
+略
+亮
+兩
+凉
+梁
+糧
+良
+諒
+量
+勵
+呂
+女
+廬
+旅
+濾
+礪
+閭
+驪
+麗
+黎
+力
+曆
+歷
+轢
+年
+憐
+戀
+撚
+漣
+煉
+璉
+秊
+練
+聯
+輦
+蓮
+連
+鍊
+列
+劣
+咽
+烈
+裂
+說
+廉
+念
+捻
+殮
+簾
+獵
+令
+囹
+寧
+嶺
+怜
+玲
+瑩
+羚
+聆
+鈴
+零
+靈
+領
+例
+禮
+醴
+隸
+惡
+了
+僚
+寮
+尿
+料
+樂
+燎
+療
+蓼
+遼
+龍
+暈
+阮
+劉
+杻
+柳
+流
+溜
+琉
+留
+硫
+紐
+類
+六
+戮
+陸
+倫
+崙
+淪
+輪
+律
+慄
+栗
+率
+隆
+利
+吏
+履
+易
+李
+梨
+泥
+理
+痢
+罹
+裏
+裡
+里
+離
+匿
+溺
+吝
+燐
+璘
+藺
+隣
+鱗
+麟
+林
+淋
+臨
+立
+笠
+粒
+狀
+炙
+識
+什
+茶
+刺
+切
+度
+拓
+糖
+宅
+洞
+暴
+輻
+行
+降
+見
+廓
+兀
+嗀
+﨎
+﨏
+塚
+﨑
+晴
+﨓
+﨔
+凞
+猪
+益
+礼
+神
+祥
+福
+靖
+精
+羽
+﨟
+蘒
+﨡
+諸
+﨣
+﨤
+逸
+都
+﨧
+﨨
+﨩
+飯
+飼
+館
+鶴
+郞
+隷
+侮
+僧
+免
+勉
+勤
+卑
+喝
+嘆
+器
+塀
+墨
+層
+屮
+悔
+慨
+憎
+懲
+敏
+既
+暑
+梅
+海
+渚
+漢
+煮
+爫
+琢
+碑
+社
+祉
+祈
+祐
+祖
+祝
+禍
+禎
+穀
+突
+節
+練
+縉
+繁
+署
+者
+臭
+艹
+艹
+著
+褐
+視
+謁
+謹
+賓
+贈
+辶
+逸
+難
+響
+頻
+恵
+𤋮
+舘
+﩮
+﩯
+並
+况
+全
+侀
+充
+冀
+勇
+勺
+喝
+啕
+喙
+嗢
+塚
+墳
+奄
+奔
+婢
+嬨
+廒
+廙
+彩
+徭
+惘
+慎
+愈
+憎
+慠
+懲
+戴
+揄
+搜
+摒
+敖
+晴
+朗
+望
+杖
+歹
+殺
+流
+滛
+滋
+漢
+瀞
+煮
+瞧
+爵
+犯
+猪
+瑱
+甆
+画
+瘝
+瘟
+益
+盛
+直
+睊
+着
+磌
+窱
+節
+类
+絛
+練
+缾
+者
+荒
+華
+蝹
+襁
+覆
+視
+調
+諸
+請
+謁
+諾
+諭
+謹
+變
+贈
+輸
+遲
+醙
+鉶
+陼
+難
+靖
+韛
+響
+頋
+頻
+鬒
+龜
+𢡊
+𢡄
+𣏕
+㮝
+䀘
+䀹
+𥉉
+𥳐
+𧻓
+齃
+龎

+ 23 - 0
format_convert/easyofd/easyofd/draw/__init__.py

@@ -0,0 +1,23 @@
+import os
+import sys
+
+from reportlab.pdfbase import pdfmetrics
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../../")
+from format_convert.easyofd.easyofd.parser_ofd import *
+
+FONTS = ['宋体',"SWPMEH+SimSun",'SimSun','KaiTi','楷体',"STKAITI","SWLCQE+KaiTi",
+         'Courier New','STSong-Light',"CourierNew","SWANVV+CourierNewPSMT",
+         "CourierNewPSMT","BWSimKai","hei","黑体","SimHei","SWDKON+SimSun",
+         "SWCRMF+CourierNewPSMT","SWHGME+KaiTi"]
+
+from .font_tools import FontTool
+from .draw_pdf import DrawPDF
+from .draw_ofd import OFDWrite
+
+
+
+
+
+
+    

+ 290 - 0
format_convert/easyofd/easyofd/draw/draw_ofd.py

@@ -0,0 +1,290 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: F:\code\easyofd\easyofd\draw
+# CREATE_TIME: 2023-10-26
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# note:  写入 xml 目录并打包成ofd 文件
+from datetime import datetime
+from io import BytesIO
+from typing import Optional
+
+from PIL import Image
+from loguru import logger
+
+from .ofdtemplate import CurId, OFDTemplate, DocumentTemplate, DocumentResTemplate, PublicResTemplate, ContentTemplate, \
+    OFDStructure
+from .pdf_parse import DPFParser
+
+
+class OFDWrite(object):
+    """
+    写入ofd 工具类
+    """
+
+    def __init__(self, ):
+        self.OP = 200 / 25.4
+        # self.OP = 1
+
+    def build_ofd_entrance(self, id_obj: Optional[CurId] = None):
+        """
+        build_ofd_entrance
+        """
+        CreationDate = str(datetime.now())
+        ofd_entrance = OFDTemplate(CreationDate=CreationDate, id_obj=id_obj)
+        return ofd_entrance
+
+    def build_document(self, img_len, id_obj: Optional[CurId] = None, PhysicalBox: Optional[str] = "0 0 140 90"):
+        """
+        build_document
+        """
+        pages = []
+
+        for idx in range(img_len):
+            pages.append(
+                {
+                    "@ID": f"{idx + 1}",
+                    "@BaseLoc": f"Pages/Page_{idx}/Content.xml"
+                }
+            )
+        document = DocumentTemplate(Page=pages, id_obj=id_obj, PhysicalBox=PhysicalBox)
+        return document
+
+    def build_document_res(self, img_len: int = 0, id_obj: Optional[CurId] = None,
+                           pfd_res_uuid_map: Optional[dict] = None):
+        """
+        build_document_res
+        """
+        MultiMedia = []
+        DrawParams = []  # todo DrawParams 参数后面有空增加
+        pfd_img = None
+        if pfd_res_uuid_map:
+            pfd_img = pfd_res_uuid_map.get("img")
+
+        if img_len and not pfd_res_uuid_map:
+            for num in range(img_len):
+                MultiMedia.append({
+                    "@ID": 0,
+                    "@Type": "Image",
+                    "ofd:MediaFile": f"Image_{num}.jpg",
+                    "res_uuid": f"{num}",
+                })
+        elif pfd_res_uuid_map and pfd_img:
+            for res_uuid in pfd_img.keys():
+                name = f"Image_{res_uuid}.jpg"
+                MultiMedia.append({
+                    "@ID": 0,
+                    "@Type": "Image",
+                    "ofd:MediaFile": name,
+                    "res_uuid": res_uuid,
+
+                })
+
+        document_res = DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
+        return document_res
+
+    def build_public_res(self, id_obj: CurId = None, pfd_res_uuid_map: dict = None):
+        """
+        build_public_res
+        """
+        fonts = []
+
+        pfd_font = None
+        if pfd_res_uuid_map:
+            pfd_font = pfd_res_uuid_map.get("font")
+
+        if pfd_res_uuid_map and pfd_font:
+            for res_uuid, font in pfd_font.items():
+                fonts.append({
+                    "@ID": 0,
+                    "@FontName": font,
+                    "@FamilyName": font,  # 匹配替代字型
+                    "res_uuid": res_uuid,
+                    "@FixedWidth": "false",
+                    "@Serif": "false",
+                    "@Bold": "false",
+                    "@Charset": "prc"
+                })
+        else:
+            pass
+
+        public_res = PublicResTemplate(Font=fonts, id_obj=id_obj)
+        return public_res
+
+    def build_content_res(self, pil_img_list=None, pdf_info_list=None, id_obj: CurId = None,
+                          pfd_res_uuid_map: dict = None):
+        """
+        pil_img_list - >一张图片是一页
+        content_res -> 写入 pdf 信息
+        """
+        PhysicalBox = None
+        content_res_list = []
+        if pil_img_list:
+            for idx, pil_img in enumerate(pil_img_list):
+                # print(pil_img)
+                # print(idx, pil_img[1], pil_img[2])
+                PhysicalBox = f"0 0 {pil_img[1]} {pil_img[2]}"
+                ImageObject = [{
+                    "@ID": 0,
+                    "@CTM": f"{pil_img[1]} 0 0 {pil_img[2]} 0 0",
+                    "@Boundary": f"0 0 {pil_img[1]} {pil_img[2]}",
+                    "res_uuid": f"{idx}",  # 资源标识
+                    "@ResourceID": f""
+                }]
+
+                conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,
+
+                                         CGTransform=[], PathObject=[], TextObject=[], id_obj=id_obj)
+                # print(conten)
+                content_res_list.append(conten)
+        elif pdf_info_list:  # 写入读取后的pdf 结果 # todo 图片id 需要关联得提前定义或者有其他方式反向对齐
+
+            for idx, content in enumerate(pdf_info_list):
+                ImageObject = []
+                TextObject = []
+                PhysicalBox = pfd_res_uuid_map["other"]["page_size"][idx]
+                PhysicalBox = f"0 0 {PhysicalBox[0]} {PhysicalBox[1]}"  # page_size 没有的话使用document 里面的
+                for block in content:
+                    # print(block)
+
+                    bbox = block['bbox']
+                    x0, y0, length, height = bbox[0] / self.OP, bbox[1] / self.OP, (bbox[2] - bbox[0]) / self.OP, (
+                            bbox[3] - bbox[1]) / self.OP
+                    if block["type"] == "text":
+
+                        count = len(block.get("text"))
+
+                        TextObject.append({
+                            "@ID": 0,
+                            "res_uuid": block.get("res_uuid"),  # 资源标识
+                            "@Font": "",
+                            "ofd:FillColor": {"Value": "156 82 35"},
+
+                            "ofd:TextCode": {
+                                "#text": block.get("text"),
+                                "@X": "0",
+                                "@Y": f"{block.get('size') / self.OP}",
+                                "@DeltaX": f"g {count - 1} {length / count}"
+                            },
+
+                            "@size": block.get("size") / self.OP,
+                            "@Boundary": f"{x0} {y0} {length} {height}",
+
+                        })
+                    elif block["type"] == "img":
+                        ImageObject.append({
+                            "@ID": 0,
+                            "res_uuid": block.get("res_uuid"),  # 资源标识
+
+                            "@Boundary": f"{x0} {y0} {length} {height}",
+                            "@ResourceID": f""  # 需要关联public res 里面的结果
+
+                        })
+
+                # for i in content:
+                #     if i["type"] == "img":
+                #         ImageObject.append(i)
+                #     elif i["type"] == "text":
+                #         TextObject.append(i)
+
+                conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,
+
+                                         CGTransform=[], PathObject=[], TextObject=TextObject, id_obj=id_obj)
+                # print(conten)
+                content_res_list.append(conten)
+        else:
+            pass
+        return content_res_list
+
+    def pil_2_bytes(self, image):
+        """Serialize a PIL image to PNG-encoded bytes via an in-memory buffer."""
+        # Create an in-memory byte buffer
+        img_bytesio = BytesIO()
+
+        # Save the image into the buffer (PNG here; other formats are possible)
+        image.save(img_bytesio, format='PNG')
+
+        # Grab the raw encoded bytes out of the buffer
+        img_bytes = img_bytesio.getvalue()
+
+        # Release the buffer
+        img_bytesio.close()
+        return img_bytes
+
+    def __call__(self, pdf_bytes=None, pil_img_list=None, optional_text=False):
+        """
+        Convert a PDF (bytes) or a list of PIL images into an OFD byte stream.
+
+        Steps:
+        0. parse the PDF (or accept page images directly)
+        1. build the required OFD template objects
+        2. assemble them into an OFD package
+        """
+        pdf_obj = DPFParser()
+        page_pil_img_list = None
+
+        # Image-backed OFD input
+        if pil_img_list:  # page images supplied directly
+            page_pil_img_list = [(self.pil_2_bytes(_img), _img.size[0] / self.OP, _img.size[1] / self.OP) for _img in
+                                 pil_img_list]
+        else:  # read the PDF, rasterizing it unless editable output was requested
+            if optional_text:  # build an editable OFD:
+                pdf_info_list, pfd_res_uuid_map = pdf_obj.extract_text_with_details(pdf_bytes)  # parse the PDF
+                # logger.debug(f"pdf_info_list: {pdf_info_list} \n pfd_res_uuid_map {pfd_res_uuid_map}")
+            else:
+                img_list = pdf_obj.to_img(pdf_bytes)
+                page_pil_img_list = [(self.pil_2_bytes(Image.frombytes("RGB", [_img.width, _img.height],
+                                                                       _img.samples)), _img.width / self.OP,
+                                      _img.height / self.OP) for _img in img_list]
+
+        id_obj = CurId()
+
+        if page_pil_img_list:  # image content -> OFD
+            res_static = {}  # image resources
+            pfd_res_uuid_map = {"img": {}}
+            PhysicalBox = f"0 0 {page_pil_img_list[0][1]} {page_pil_img_list[0][2]}"
+            for idx, pil_img_tuple in enumerate(page_pil_img_list):
+                pfd_res_uuid_map["img"][f"{idx}"] = pil_img_tuple[0]
+                res_static[f"Image_{idx}.jpg"] = pil_img_tuple[0]
+            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
+            document = self.build_document(len(page_pil_img_list), id_obj=id_obj, PhysicalBox=PhysicalBox)
+            public_res = self.build_public_res(id_obj=id_obj)
+            document_res = self.build_document_res(len(page_pil_img_list), id_obj=id_obj,
+                                                   pfd_res_uuid_map=pfd_res_uuid_map)
+
+            content_res_list = self.build_content_res(page_pil_img_list, id_obj=id_obj,
+                                                      pfd_res_uuid_map=pfd_res_uuid_map)
+
+
+        else:
+            # The generated document objects must all share the same id-generator instance
+            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
+            document = self.build_document(len(pdf_info_list), id_obj=id_obj)
+            public_res = self.build_public_res(id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
+            document_res = self.build_document_res(len(pdf_info_list), id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
+            content_res_list = self.build_content_res(pdf_info_list=pdf_info_list, id_obj=id_obj,
+                                                      pfd_res_uuid_map=pfd_res_uuid_map)
+
+            res_static = {}  # image resources
+
+            print("pfd_res_uuid_map", pfd_res_uuid_map)
+            img_dict = pfd_res_uuid_map.get("img")
+            if img_dict:
+                for key, v_io in img_dict.items():
+                    res_static[f"Image_{key}.jpg"] = v_io.getvalue()
+
+        # Assemble and return the OFD package bytes
+        ofd_byte = OFDStructure("123", ofd=ofd_entrance, document=document, public_res=public_res,
+                                document_res=document_res, content_res=content_res_list, res_static=res_static)(
+            test=True)
+        return ofd_byte
+
+
+if __name__ == "__main__":
+    pdf_p = r"D:\renodoc\技术栈\GBT_33190-2016_电子文件存储与交换格式版式文档.pdf"
+    pdf_p = r"F:\code\easyofd\test"
+    with open(pdf_p, "rb") as f:
+        content = f.read()
+
+    ofd_content = OFDWrite()(content)
+
+    with open("ofd.ofd", "wb") as f:
+        f.write(ofd_content)

+ 1178 - 0
format_convert/easyofd/easyofd/draw/draw_pdf.py

@@ -0,0 +1,1178 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: E:\code\easyofd\easyofd\draw
+# CREATE_TIME: 2023-08-10
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:  绘制pdf
+import base64
+import math
+import os
+import re
+import sys
+import traceback
+from io import BytesIO
+
+from PIL import Image as PILImage, Image, ImageFont, ImageDraw
+from fontTools.ttLib import TTFont
+from loguru import logger
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.utils import ImageReader
+from reportlab.pdfgen import canvas
+
+from format_convert.utils import special_font_to_normal, image_resize_by_ratio
+
+sys.path.append(os.path.dirname(__file__) + "/../../../../")
+from format_convert.easyofd.easyofd.draw.font_tools import FontTool
+from .find_seal_img import SealExtract
+
+
+# print(reportlab_fonts)
+class DrawPDF():
+    """
+    ofd 解析结果 绘制pdf
+    OP ofd 单位转换
+    """
+
+    def __init__(self, data, *args, **kwargs):
+        # data: parsed OFD result list; the first entry carries the output pdf name.
+        assert data, "未输入ofd解析结果"
+        self.data = data
+        self.author = "renoyuan"
+        # OP: OFD millimetre units -> pixels at 200 dpi (200 px per 25.4 mm)
+        self.OP = 200 / 25.4
+        # self.OP = 1
+        self.pdf_uuid_name = self.data[0]["pdf_name"]
+        self.pdf_io = BytesIO()  # in-memory buffer that receives the generated PDF
+        self.SupportImgType = ("JPG", "JPEG", "PNG")  # image suffixes we can embed
+        self.init_font = "宋体"  # default font name (SimSun)
+        self.font_tool = FontTool()
+        self.page_need_to_image_dict = {}
+
+    def draw_lines(my_canvas):
+        """
+        Draw a horizontal rule at y=710 plus ten more lines spaced 10pt below it.
+
+        NOTE(review): there is no `self` parameter — if invoked as an instance
+        method the instance itself is passed as `my_canvas`; confirm this is
+        only ever called with an explicit canvas argument.
+        """
+        my_canvas.setLineWidth(.3)
+
+        start_y = 710
+        my_canvas.line(30, start_y, 580, start_y)
+
+        for x in range(10):
+            start_y -= 10
+            my_canvas.line(30, start_y, 580, start_y)
+
+    def gen_empty_pdf(self):
+        """Write a single A4 fallback page into self.pdf_io saying the OFD could not be parsed."""
+        c = canvas.Canvas(self.pdf_io)
+        c.setPageSize(A4)
+        c.setFont(self.init_font, 20)
+        c.drawString(0, 210, "ofd 格式错误,不支持解析", mode=1)
+        c.save()
+
+    # Per-character offset computation
+    def cmp_offset(self, pos, offset, DeltaRule, text, CTM_info, dire="X") -> list:
+        """
+        Compute absolute per-character positions along one axis.
+
+        pos: text-box x|y coordinate
+        offset: X|Y of the first character
+        DeltaRule: OFD DeltaX/DeltaY offset-rule string
+        text: the run's text (used when no rule is given)
+        CTM_info: parsed transform dict (scale/rotate/translate) or {}
+        dire: axis selector, "X" or "Y"
+        Returns a list of x|y positions, one entry per character.
+        """
+        if CTM_info and dire == "X":
+            resize = CTM_info.get("resizeX")
+            rotate = CTM_info.get("rotateX")
+            move = CTM_info.get("moveX")
+        elif CTM_info and dire == "Y":
+            resize = CTM_info.get("resizeY")
+            rotate = CTM_info.get("rotateY")
+            move = CTM_info.get("moveY")
+        else:
+            resize = 1
+            rotate = 0
+            move = 0
+
+        # print(f"resize is {resize}")
+        char_pos = float(pos if pos else 0) + (float(offset if offset else 0) + move) * resize
+        pos_list = []
+        pos_list.append(char_pos)  # first character's position
+        offsets = [i for i in DeltaRule.split(" ")]
+
+        if "g" in DeltaRule:  # "g" introduces a run-length group: g <count> <step>
+            g_no = None
+            for _no, offset_i in enumerate(offsets):
+
+                if offset_i == "g":
+                    g_no = _no
+                    for j in range(int(offsets[(g_no + 1)])):
+                        char_pos += float(offsets[(g_no + 2)])
+                        pos_list.append(char_pos)
+
+                elif offset_i and offset_i != "g":
+                    if g_no == None:
+                        char_pos += float(offset_i) * resize
+                        pos_list.append(char_pos)
+                    elif (int(_no) > int(g_no + 2)) and g_no != None:
+                        # print(f"offset_i is {offset_i}")
+                        char_pos += float(offset_i) * resize
+                        pos_list.append(char_pos)
+
+        elif not DeltaRule:  # no offset rule — typically a single-character run
+            pos_list = []
+            for i in range(len(text)):
+                pos_list.append(char_pos)
+        else:  # plain space-separated per-character offsets
+            for i in offsets:
+                if not i:
+                    char_pos += 0
+                else:
+                    char_pos += float(i) * resize
+                pos_list.append(char_pos)
+
+        return pos_list
+
+    def draw_chars_old(self, canvas, text_list, fonts, page_size):
+        """Legacy text writer: per-character placement via cmp_offset (kept for reference)."""
+        c = canvas
+        for line_dict in text_list:
+            # TODO serialize the body once before writing so the final values/format can be inspected
+            text = line_dict.get("text")
+            # font_info = fonts.get(line_dict.get("font"), {})
+            # if font_info:
+            #     font_name = font_info.get("FontName", "")
+            # else:
+            #     font_name = self.init_font
+            # print(f"font_name:{font_name}")
+
+            # TODO check whether the font already exists; otherwise pick the closest match
+            # if font_name not in self.font_tool.FONTS:
+            #     font_name = self.font_tool.FONTS[0]
+            font_name = self.init_font
+
+            font = self.font_tool.normalize_font_name(font_name)
+            # print(f"font_name:{font_name} font:{font}")
+
+            try:
+                c.setFont(font, line_dict["size"] * self.OP)
+            except KeyError as key_error:
+                logger.error(f"{key_error}")
+                font = self.font_tool.FONTS[0]
+                c.setFont(font, line_dict["size"] * self.OP)
+            # PDF origin is the bottom-left corner of the page
+            color = line_dict.get("color", [0, 0, 0])
+            if len(color) < 3:
+                color = [0, 0, 0]
+
+            c.setFillColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
+            c.setStrokeColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
+
+            DeltaX = line_dict.get("DeltaX", "")
+            DeltaY = line_dict.get("DeltaY", "")
+            # print("DeltaX",DeltaX)
+            X = line_dict.get("X", "")
+            Y = line_dict.get("Y", "")
+            CTM = line_dict.get("CTM", "")  # OFD adds this per-run character transform
+            resizeX = 1
+            resizeY = 1
+            # CTM =None # some data does not use this CTM
+            CTMS = None
+            if CTM:
+                CTMS = CTM.split(" ")
+
+            if CTM and CTMS and len(CTMS) == 6:
+                CTM_info = {
+                    "resizeX": float(CTMS[0]),
+                    "rotateX": float(CTMS[1]),
+                    "rotateY": float(CTMS[2]),
+                    "resizeY": float(CTMS[3]),
+                    "moveX": float(CTMS[4]),
+                    "moveY": float(CTMS[5]),
+
+                }
+
+            else:
+                CTM_info = {}
+            x_list = self.cmp_offset(line_dict.get("pos")[0], X, DeltaX, text, CTM_info, dire="X")
+            y_list = self.cmp_offset(line_dict.get("pos")[1], Y, DeltaY, text, CTM_info, dire="Y")
+
+            # print("x_list",x_list)
+            # print("y_list",y_list)
+            # print("Y",page_size[3])
+            # print("x",page_size[2])
+            # if line_dict.get("Glyphs_d") and  FontFilePath.get(line_dict["font"])  and font_f not in FONTS:
+            if False:  # glyph-image path for custom fonts; drawPath performance is poor, disabled for now
+                Glyphs = [int(i) for i in line_dict.get("Glyphs_d").get("Glyphs").split(" ")]
+                for idx, Glyph_id in enumerate(Glyphs):
+                    _cahr_x = float(x_list[idx]) * self.OP
+                    _cahr_y = (float(page_size[3]) - (float(y_list[idx]))) * self.OP
+                    imageFile = draw_Glyph(FontFilePath.get(line_dict["font"]), Glyph_id, text[idx])
+
+                    # font_img_info.append((FontFilePath.get(line_dict["font"]), Glyph_id,text[idx],_cahr_x,_cahr_y,-line_dict["size"]*Op*2,line_dict["size"]*Op*2))
+                    c.drawImage(imageFile, _cahr_x, _cahr_y, -line_dict["size"] * self.OP * 2,
+                                line_dict["size"] * self.OP * 2)
+            else:
+                if len(text) > len(x_list) or len(text) > len(y_list):
+                    text = re.sub("[^\u4e00-\u9fa5]", "", text)
+                try:
+                    # Line-at-a-time write when the last character would land outside the page
+                    if y_list[-1] * self.OP > page_size[3] * self.OP or x_list[-1] * self.OP > page_size[2] * self.OP or \
+                            x_list[-1] < 0 or y_list[-1] < 0:
+                        # if True:
+                        # print("line wtite")
+                        x_p = abs(float(X)) * self.OP
+                        y_p = abs(float(page_size[3]) - (float(Y))) * self.OP
+                        print('text, x_p, y_p', text, x_p, y_p)
+                        c.drawString(x_p, y_p, text, mode=0)  # mode=3 invisible text, 0 visible
+
+                        # text_write.append((x_p,  y_p, text))
+                    # Character-at-a-time write
+                    else:
+                        for char_id, _char in enumerate(text):
+                            if len(x_list) > char_id:
+                                # print("char wtite")
+                                font_size = line_dict["size"] * self.OP * resizeX
+                                c.setFont(font, line_dict["size"] * self.OP * resizeX)
+                                _char_x = float(x_list[char_id]) * self.OP
+                                _char_y = (float(page_size[3]) - (float(y_list[char_id]))) * self.OP
+                                # print(_cahr_x,  _cahr_y, _cahr_)
+                                print('_cahr_, _char_x, _char_y', _char, _char_x, _char_y, font_size)
+                                c.drawString(_char_x, _char_y, _char, mode=0)  # mode=3 invisible text, 0 visible
+                                # NOTE(review): this break writes only the first character of the run — confirm intended
+                                break
+                            else:
+                                pass
+                                # logger.debug(f"match {_cahr_} pos error \n{text} \n{x_list}")
+                            # text_write.append((_cahr_x,  _cahr_y, _cahr_))
+                        # NOTE(review): this break exits the text_list loop after one run — confirm intended
+                        break
+                except Exception as e:
+                    logger.error(f"{e}")
+                    traceback.print_exc()
+
+    def draw_chars(self, canvas, text_list, fonts, page_size, pdf_page_size):
+        """Draw text runs onto the reportlab canvas, one drawString call per run."""
+        for line_dict in text_list:
+            # TODO serialize the body once before writing so the final values/format can be inspected
+            # print('line_dict', line_dict)
+            text = line_dict.get("text")
+            text_size = line_dict.get("size")
+            if not text_size:
+                print('draw_chars not text_size', text)
+                # NOTE(review): `return` aborts all remaining runs on the page — confirm `continue` was not intended
+                return
+
+            # Transformation matrix
+            ctm = line_dict.get("CTM", '')
+            ctm = self.get_ctm(ctm)
+            a, b, c, d, e, f = ctm
+            # Average of the horizontal and vertical scale factors
+            font_scale = (a + d) / 2
+
+            color = line_dict.get("color", [0, 0, 0])
+            if len(color) < 3:
+                color = [0, 0, 0]
+            canvas.setFillColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
+            # c.setStrokeColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
+
+            # Text boundary box
+            boundary = line_dict.get("pos")
+            if len(boundary) != 4:
+                print('draw_chars not boundary', text, boundary)
+                return
+            left, top, width, height = boundary
+
+            # A doubled "g" group in DeltaX signals duplicated text; keep only one copy
+            delta_x = line_dict.get("DeltaX", "")
+            delta_y = line_dict.get("DeltaY", "")
+            g_cnt = re.findall('g', delta_x)
+            if len(g_cnt) >= 2:
+                split_index = len(text) / 2
+                if text[:int(split_index)] == text[int(split_index):]:
+                    text2 = text[:int(split_index)]
+                    print('len(g_cnt) >= 2', g_cnt, text, '->', text2)
+                    text = text2
+
+            # Text offset relative to the boundary
+            x = line_dict.get("X", "")
+            y = line_dict.get("Y", "")
+            if "" in [x, y]:
+                print('draw_chars not x or not y', text, x, y)
+                return
+            x, y = float(x) * font_scale, float(y) * font_scale
+
+            font_name = self.init_font
+            font = self.font_tool.normalize_font_name(font_name)
+
+            # Absolute coordinates from the boundary plus the offsets
+            actual_left = left + x
+            actual_right = actual_left + width
+            actual_top = top + y
+            actual_bottom = actual_top + y
+
+            # print('actual_left, actual_top', text, actual_left, actual_top)
+
+            # Font size scaled by the CTM
+            actual_size = text_size * font_scale
+
+            canvas.setFont(font, actual_size * self.OP)
+
+            # print('actual_bottom, y', actual_bottom, y)
+            # OFD's origin is the top-left corner, PDF's is the bottom-left
+            try:
+                # print('text111', text, actual_left * self.OP, pdf_page_size[3] - actual_bottom * self.OP)
+                # Write the whole run at once
+                canvas.drawString(actual_left * self.OP,
+                                  pdf_page_size[3] - actual_top * self.OP,
+                                  text, mode=0)
+
+            except Exception as e:
+                logger.error(f"{e}")
+                traceback.print_exc()
+
+    def draw_odf_char_on_image(self, line_dict, img, pos, ofd_page_size):
+        """Render one OFD text run directly onto a PIL image; returns the image (None on bad input)."""
+        text = line_dict.get("text")
+        text_size = line_dict.get("size")
+        if not text_size:
+            print('get_odf_char_info not text_size', text)
+            return
+
+        # Transformation matrix
+        ctm = line_dict.get("CTM", '')
+        ctm = self.get_ctm(ctm)
+        a, b, c, d, e, f = ctm
+        # Average of the horizontal and vertical scale factors
+        font_scale = (a + d) / 2
+
+        color = line_dict.get("color", [0, 0, 0])
+        if len(color) < 3:
+            color = (0, 0, 0)
+        else:
+            color = tuple([int(x) for x in color])
+        # print('color', color)
+
+        # Text boundary box
+        boundary = line_dict.get("pos")
+        if len(boundary) != 4:
+            print('get_odf_char_info not boundary', text, boundary)
+            return
+        left, top, width, height = boundary
+
+        # Offset of the text inside the boundary; a y smaller than the size clips the glyphs
+        x = line_dict.get("X", "")
+        y = line_dict.get("Y", "")
+        # print('x, y', x, y)
+        if "" in [x, y]:
+            print('get_odf_char_info not x or not y', text, x, y)
+            return
+        x, y = float(x) * a, float(y) * d
+
+        # Absolute coordinates from the boundary plus the offsets
+        actual_left = left
+        actual_right = actual_left + x
+        actual_top = top
+        actual_bottom = actual_top + y
+
+        # print('actual_left', actual_left, ofd_page_size[2], pos[2])
+        # actual_left = actual_left / ofd_page_size[2] * pos[2]
+        # print('actual_left2', actual_left, ofd_page_size[2], pos[2])
+        # actual_top = actual_top / ofd_page_size[3] * pos[3]
+
+        # actual_bottom = bottom + y
+        # actual_top = actual_bottom + y
+
+        # print('actual_left, actual_top', text, actual_left, actual_top)
+
+        # Font size scaled by the CTM, then mapped into image pixels via pos
+        actual_size = text_size * font_scale
+        actual_size = int(actual_size * img.size[0] / pos[2])
+
+        left_top_point = [actual_left * img.size[0] / pos[2], actual_top * img.size[1] / pos[3]]
+        left_top_point = [int(x) for x in left_top_point]
+        draw = ImageDraw.Draw(img)
+        font = ImageFont.truetype(os.path.dirname(__file__) + '/simsun.ttc', actual_size)
+
+        # print('text left_top_point, actual_size', text, left_top_point, actual_size)
+        # print('img.size', img.size)
+
+        draw.text(left_top_point, text, font=font, fill=color)
+        return img
+
+    def compute_ctm(self, CTM, x1, y1, img_width, img_height):
+        """Provisional helper: apply a 6-element CTM string to an image rectangle, return x, y, w, h."""
+        a, b, c, d, e, f = CTM.split(" ")
+        a, b, c, d, e, f = float(a), float(b), float(c), float(d), float(e), float(f)
+        # Matrix elements a b c d e f taken from the CTM string
+
+        # Original rectangle corners
+        x2 = x1 + img_width
+        y2 = y1 + img_height
+        print(f"ori x1 {x1} y1 {y1} x2 {x2} y2 {y2} img_width {img_width} img_height {img_height}")
+        a = a / 10
+        d = d / 10
+        # Transform the top-left and bottom-right corners
+        x1_new = a * x1 + c * y1 + (e)
+        y1_new = b * x1 + d * y1 + (f)
+        x2_new = a * x2 + c * y2 + (e)
+        y2_new = b * x2 + d * y2 + (f)
+        print(f"x1_new {x1_new} y1_new {y1_new} x2_new {x2_new} y2_new {y2_new}")
+        # Width and height of the transformed rectangle
+        w_new = x2_new - x1_new
+        h_new = y2_new - y1_new
+
+        print(f"原始矩形宽度: {img_width}, 高度: {img_height}")
+        print(f"变换后矩形宽度: {w_new}, 高度: {h_new}")
+        return x1_new, y1_new, w_new, h_new
+
+    def get_ctm(self, ctm):
+        """Parse a space-separated 6-element CTM string; fall back to the identity matrix."""
+        # NOTE(review): the fallback is a tuple but the parsed path returns a list — callers only unpack, so OK
+        default_ctm = (1, 0, 0, 1, 0, 0)
+        if not ctm:
+            # print('get_ctm no ctm!', ctm)
+            return default_ctm
+        ctm = ctm.split(" ")
+        if len(ctm) != 6:
+            print('get_ctm len(ctm) != 6', ctm)
+            return default_ctm
+        ctm = [float(x) for x in ctm]
+        # a, b, c, d, e, f = ctm
+        return ctm
+
+    def draw_img_old(self, canvas, img_list, images, page_size):
+        """Legacy image writer (kept for reference)."""
+        c = canvas
+        for img_d in img_list:
+            image = images.get(img_d["ResourceID"])
+
+            if not image or image.get("suffix").upper() not in self.SupportImgType:
+                continue
+
+            imgbyte = base64.b64decode(image.get('imgb64'))
+            if not imgbyte:
+                logger.error(f"{image['fileName']} is null")
+                continue
+
+            img = PILImage.open(BytesIO(imgbyte))
+            img_width, img_height = img.size
+            # img_width = img_width / self.OP *25.4
+            # img_height = img_height / self.OP *25.4
+            info = img.info
+            # print( f"ing info dpi {info.get('dpi')}")
+            # print(img_width, img_height)
+            imgReade = ImageReader(img)
+            CTM = img_d.get('CTM')
+            # print("CTM", CTM)
+
+            wrap_pos = image.get("wrap_pos")
+            # print("wrap_pos", wrap_pos)
+            pos = img_d.get('pos')
+            # print("pos", pos)
+            # NOTE(review): CTM is unconditionally reset to None, so the CTM branch below never runs
+            CTM = None
+            if CTM and not wrap_pos and page_size == pos:
+                x1_new, y1_new, w_new, h_new = self.compute_ctm(CTM, 0, 0, img_width, img_height)
+                pdf_pos = [pos[0] * self.OP, pos[1] * self.OP, pos[2] * self.OP, pos[3] * self.OP]
+                print(f"pos: {pos} pdf_pos: {pdf_pos}")
+
+                x1_new = (pos[0] + x1_new) * self.OP
+                y1_new = (page_size[3] - y1_new) * self.OP
+                if w_new > pdf_pos[2]:
+                    w_new = pdf_pos[2]
+                if h_new > pdf_pos[3]:
+                    h_new = pdf_pos[3]
+                print(f"写入 {x1_new} {y1_new} {w_new} {-h_new}")
+                c.drawImage(imgReade, x1_new, y1_new, w_new, -h_new, 'auto')
+            else:
+                x_offset = 0
+                y_offset = 0
+
+                x = (pos[0] + x_offset) * self.OP
+                y = (page_size[3] - (pos[1] + y_offset)) * self.OP
+                if wrap_pos:
+                    x = x + (wrap_pos[0] * self.OP)
+                    y = y - (wrap_pos[1] * self.OP)
+                    w = img_d.get('pos')[2] * self.OP
+                    h = -img_d.get('pos')[3] * self.OP
+
+                    # print(x, y, w, h)
+                    c.drawImage(imgReade, x, y, w, h, 'auto')
+                elif pos:
+                    # print(f"page_size == pos :{page_size == pos} ")
+                    x = pos[0] * self.OP
+                    y = (page_size[3] - pos[1]) * self.OP
+                    w = pos[2] * self.OP
+                    h = -pos[3] * self.OP
+
+                    # print("pos",pos[0],pos[1],pos[2]* self.OP,pos[3]* self.OP)
+                    # print(x2_new, -y2_new, w_new, h_new,)
+
+                    # NOTE(review): coordinates computed above are discarded and the raw image size used instead
+                    x, y = 0, 0
+                    w, h = img.size
+
+                    print('x, y, w, h', x, y, w, h)
+
+                    c.drawImage(imgReade, x, y, w, h, 'auto')
+                    # c.drawImage(imgReade,x2_new, -y2_new, w_new, h_new, 'auto')
+
+    def draw_img(self, canvas, img_list, images, ofd_page_size, pdf_page_size, ofd_to_pdf_ratio):
+        """Draw page images: decode base64, rotate per CTM, resize to the PDF page, place on the canvas."""
+        c = canvas
+        for img_d in img_list:
+            image = images.get(img_d["ResourceID"])
+            if not image or image.get("suffix").upper() not in self.SupportImgType:
+                print('img_d["ResourceID"]', img_d["ResourceID"])
+                logger.error(f"not image")
+                continue
+
+            imgbyte = base64.b64decode(image.get('imgb64'))
+            if not imgbyte:
+                logger.error(f"{image['fileName']} is null")
+                continue
+
+            img = PILImage.open(BytesIO(imgbyte))
+            info = img.info
+            # print( f"ing info dpi {info.get('dpi')}")
+            ctm = img_d.get('CTM')
+            # print("ctm", ctm)
+            pos = img_d.get('pos')
+            pdf_pos = [x * ofd_to_pdf_ratio for x in pos]
+            # print('pos', pos)
+            # print('pdf_pos', pdf_pos)
+            # print('ofd_page_size', ofd_page_size)
+            # print('pdf_page_size', pdf_page_size)
+            if pos:
+                # Skip degenerate (near zero-sized) placements
+                if pos[2] <= 0.1 or pos[3] <= 0.1:
+                    print('pos[2] <= 0.1 or pos[3] <= 0.1')
+                    continue
+                x, y = pdf_pos[0], pdf_page_size[3] - pdf_pos[1] - pdf_pos[3]
+                w, h = img.size
+                ctm = ctm.split(' ')
+                ctm = [float(x) for x in ctm]
+                # NOTE(review): the 6 CTM values are unpacked as a, b, d, e, f, g (no c) — confirm intended naming
+                a, b, d, e, f, g = ctm
+                if b == 0 and d == 0:
+                    angle_deg = 0
+                else:
+                    # Rotation angle, allowing for possible mirroring
+                    angle_rad = math.atan2(b, a)
+                    angle_deg = math.degrees(angle_rad)
+                    # Normalize the angle into the 0-360 degree range
+                    angle_deg = angle_deg % 360
+                img = img.rotate(-angle_deg, expand=1)
+                img = img.resize((int(pdf_pos[2]), int(pdf_pos[3])), Image.BICUBIC)
+                img = image_resize_by_ratio(img, int(pdf_page_size[2]), int(pdf_page_size[3]))
+                # img = img.resize((int(pdf_page_size[2]), int(pdf_page_size[3])), Image.BICUBIC)
+                # img = img.rotate(180, expand=1)
+                w, h = img.size
+                # print('jb2 angle_deg, x, y, w, h', angle_deg, x, y, w, h)
+                # Palette images must be converted before reportlab can embed them
+                if img.mode == 'P':
+                    img = img.convert('RGBA')
+                imgReade = ImageReader(img)
+                # print('img.size, x, y, w, h, img.mode', img.size, x, y, w, h, img.mode)
+                c.drawImage(imgReade, x, y, w, h, 'auto')
+
+    def draw_img_with_annot(self, canvas, img_list, images, annot_page_size, pdf_page_size, ofd_to_pdf_ratio, annot_page_info):
+        """Like draw_img, but burns the page's annotation text runs onto each image before placing it."""
+        c = canvas
+        for img_d in img_list:
+            image = images.get(img_d["ResourceID"])
+            if not image or image.get("suffix").upper() not in self.SupportImgType:
+                print('img_d["ResourceID"]', img_d["ResourceID"])
+                logger.error(f"not image")
+                continue
+
+            imgbyte = base64.b64decode(image.get('imgb64'))
+            if not imgbyte:
+                logger.error(f"{image['fileName']} is null")
+                continue
+
+            img = PILImage.open(BytesIO(imgbyte))
+            ctm = img_d.get('CTM')
+            pos = img_d.get('pos')
+            pdf_pos = [x * ofd_to_pdf_ratio for x in pos]
+            if pos:
+                # Skip degenerate (near zero-sized) placements
+                if pos[2] <= 0.1 or pos[3] <= 0.1:
+                    print('pos[2] <= 0.1 or pos[3] <= 0.1')
+                    continue
+                x, y = pdf_pos[0], pdf_page_size[3] - pdf_pos[1] - pdf_pos[3]
+                w, h = img.size
+                ctm = ctm.split(' ')
+                ctm = [float(x) for x in ctm]
+                a, b, d, e, f, g = ctm
+                if b == 0 and d == 0:
+                    angle_deg = 0
+                else:
+                    # Rotation angle, allowing for possible mirroring
+                    angle_rad = math.atan2(b, a)
+                    angle_deg = math.degrees(angle_rad)
+                    # Normalize the angle into the 0-360 degree range
+                    angle_deg = angle_deg % 360
+
+                img = img.rotate(-angle_deg, expand=1)
+                print('angle_deg', angle_deg)
+
+                # Burn the annotation text runs onto the page image
+                # text_list = annot_page_info
+                for text_d in annot_page_info:
+                    # print('text_d', text_d)
+                    # print('img.size', img.size)
+                    print('img pos', pos)
+                    # NOTE(review): draw_odf_char_on_image can return None on bad runs, clobbering img — confirm
+                    img = self.draw_odf_char_on_image(text_d, img, pos, annot_page_size)
+
+                img = img.resize((int(pdf_pos[2]), int(pdf_pos[3])), Image.BICUBIC)
+                img = image_resize_by_ratio(img, int(pdf_page_size[2]), int(pdf_page_size[3]))
+
+                w, h = img.size
+                # Palette images must be converted before reportlab can embed them
+                if img.mode == 'P':
+                    img = img.convert('RGBA')
+                imgReade = ImageReader(img)
+                c.drawImage(imgReade, x, y, w, h, 'auto')
+
+    def draw_signature(self, canvas, signatures_page_list, page_size):
+        """
+        Draw seal/signature images extracted from SignedValue onto the page.
+
+        Each entry has the shape:
+            {
+            "sing_page_no": sing_page_no,
+            "PageRef": PageRef,
+            "Boundary": Boundary,
+            "SignedValue": self.file_tree(SignedValue),
+                            }
+        """
+        c = canvas
+        try:
+            if signatures_page_list:
+                # print("signatures_page_list",signatures_page_list)
+                for signature_info in signatures_page_list:
+                    image = SealExtract()(b64=signature_info.get("SignedValue"))
+                    if not image:
+                        # logger.info(f"提取不到签章图片")
+                        continue
+                    else:
+                        image_pil = image[0]
+
+                    pos = [float(i) for i in signature_info.get("Boundary").split(" ")]
+
+                    imgReade = ImageReader(image_pil)
+
+                    x = pos[0] * self.OP
+                    y = (page_size[3] - pos[1]) * self.OP
+
+                    w = pos[2] * self.OP
+                    h = -pos[3] * self.OP
+                    c.drawImage(imgReade, x, y, w, h, 'auto')
+                    # print(f"签章写入成功")
+            else:
+                # No signatures on this page
+                pass
+        except Exception as e:
+            # print(f"签章写入失败 {e}")
+            traceback.print_exc()
+
+    def draw_line_old(self, canvas, line_list, page_size):
+        """绘制线条"""
+
+        # print("绘制",line_list)
+
+        def match_mode(Abbr: list):
+            """
+            解析AbbreviatedData
+            匹配各种线条模式
+            S 定义起始 坐标 x, y
+            M 移动到指定坐标 x, y
+            L 从当前点移动到指定点 x, y
+            Q x1 y1 x2 y2 二次贝塞尔曲线
+            B x1 y1 x2 y2 x3 y3 三次贝塞尔曲线
+            A 到 x,y 的圆弧 并移动到 x,y  rx 长轴 ry 短轴 angle 旋转角度 large为1表示 大于180 的弧 为0时表示小于180的弧 swcpp 为1 表示顺时针旋转 0 表示逆时针旋转
+            C 当前点和SubPath自动闭合
+            """
+            relu_list = []
+            mode = ""
+            modes = ["S", "M", "L", "Q", "B", "A", "C"]
+            mode_dict = {}
+            for idx, i in enumerate(Abbr):
+                if i in modes:
+                    mode = i
+                    if mode_dict:
+                        relu_list.append(mode_dict)
+                    mode_dict = {"mode": i, "points": []}
+
+                else:
+                    mode_dict["points"].append(i)
+
+                if idx + 1 == len(Abbr):
+                    relu_list.append(mode_dict)
+            return relu_list
+
+        def assemble(relu_list: list):
+            start_point = {}
+            acticon = []
+            for i in relu_list:
+                if i.get("mode") == "M":
+                    start_point = i
+                elif i.get("mode") in ['B', "Q", 'L']:
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+            return acticon
+
+        def convert_coord(p_list, direction, page_size, pos):
+            """坐标转换ofd2pdf"""
+            new_p_l = []
+            for p in p_list:
+                if direction == "x":
+
+                    new_p = (float(pos[0]) + float(p)) * self.OP
+                else:
+                    new_p = (float(page_size[3]) - float(pos[1]) - float(p)) * self.OP
+                new_p_l.append(new_p)
+            return new_p_l
+
+        for line in line_list:
+            Abbr = line.get("AbbreviatedData").split(" ")  # AbbreviatedData 
+            color = line.get("FillColor", [0, 0, 0])
+
+            relu_list = match_mode(Abbr)
+            # TODO 组合 relu_list 1 M L 直线 2 M B*n 三次贝塞尔线 3 M Q*n 二次贝塞尔线
+
+            # print(relu_list)
+
+            acticons = assemble(relu_list)
+            pos = line.get("pos")
+            # print(color)
+            if len(color) < 3:
+                color = [0, 0, 0]
+            canvas.setStrokeColorRGB(*(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255))  # 颜色
+
+            # 设置线条宽度
+            try:
+                LineWidth = (float(line.get("LineWidth", "0.25").replace(" ", "")) if \
+                                 line.get("LineWidth", "0.25").replace(" ", "") else 0.25) * self.OP
+            except Exception as e:
+                # logger.error(f"{e}")
+                LineWidth = 0.25 * self.OP
+
+            canvas.setLineWidth(LineWidth)  # 单位为点,2 表示 2 点
+
+            for acticon in acticons:
+                if acticon.get("end_point").get("mode") == 'L':  # 直线
+                    x1, y1, x2, y2 = *acticon.get("start_point").get("points"), *acticon.get("end_point").get("points")
+                    x1, x2 = convert_coord([x1, x2], "x", page_size, pos)
+                    y1, y2 = convert_coord([y1, y2], "y", page_size, pos)
+                    # 绘制一条线 x1 y1 x2 y2
+                    canvas.line(x1, y1, x2, y2)
+
+                elif acticon.get("end_point").get("mode") == 'B':  # 三次贝塞尔线
+                    continue
+                    x1, y1, x2, y2, x3, y3, x4, y4 = *acticon.get("start_point").get("points"), *acticon.get(
+                        "end_point").get("points")
+                    x1, x2, x3, x4 = convert_coord([x1, x2, x3, x4], "x", page_size, pos)
+                    y1, y2, y3, y4 = convert_coord([y1, y2, y3, y4], "y", page_size, pos)
+                    # print(x1, y1, x2, y2, x3, y3, x4, y4)
+
+                    # 绘制三次贝塞尔线
+                    canvas.bezier(x1, y1, x2, y2, x3, y3, x4, y4)
+
+                elif acticon.get("end_point").get("mode") == 'Q':  # 二次贝塞尔线
+                    pass
+                else:
+                    continue
+
+    def draw_line_old_250619(self, canvas, line_list, page_size):
+        """(legacy, 2025-06-19) Draw OFD paths via a reportlab path object.
+
+        Superseded by ``draw_line``; the arc ('A') handling below is known to
+        be wrong (see inline notes).  Kept for reference.
+        """
+        def match_mode(Abbr: list):
+            """
+            Parse AbbreviatedData and split it into segment groups.
+            Modes:
+            S  define start point x, y
+            M  move to x, y
+            L  line from the current point to x, y
+            Q  x1 y1 x2 y2: quadratic Bezier to (x2,y2) using control point
+               (x1,y1); the current point moves to (x2,y2).
+            B  x1 y1 x2 y2 x3 y3: cubic Bezier to (x3,y3) using control points
+               (x1,y1) and (x2,y2); the current point moves to (x3,y3).
+            A  rx ry angle large sweep x y: arc to (x,y); rx/ry are the ellipse
+               axes, angle the rotation (positive = clockwise), large=1 for
+               arcs over 180°, sweep=1 for clockwise direction.
+            C  no operands: close the sub-path back to its start point.
+            """
+            relu_list = []
+            mode = ""
+            modes = ["S", "M", "L", "Q", "B", "A", "C"]
+            mode_dict = {}
+            for idx, i in enumerate(Abbr):
+                if i in modes:
+                    mode = i
+                    if mode_dict:
+                        relu_list.append(mode_dict)
+                    mode_dict = {"mode": i, "points": []}
+
+                else:
+                    mode_dict["points"].append(i)
+
+                if idx + 1 == len(Abbr):
+                    relu_list.append(mode_dict)
+            return relu_list
+
+        def assemble(relu_list: list):
+            # Pair each segment with its sub-path start point (first M or last S).
+            start_point = {}
+            acticon = []
+
+            for i in relu_list:
+                if i.get("mode") == "M":
+                    if not start_point:
+                        start_point = i
+                    acticon.append({
+                        "start_point": start_point, "end_point": i})
+
+                elif i.get("mode") in ['B', "Q", 'L']:
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "C":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "A":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "S":
+                    start_point = i
+
+            return acticon
+
+        def convert_coord(p_list, direction, page_size, pos):
+            """Convert coordinates from OFD space to PDF space (y axis flipped)."""
+            new_p_l = []
+            # print("p_list", p_list)
+            for p in p_list:
+                if direction == "x":
+                    new_p = (float(pos[0]) + float(p)) * self.OP
+                else:
+                    new_p = (float(page_size[3]) - float(pos[1]) - float(p)) * self.OP
+                new_p_l.append(new_p)
+            # print("new_p_l", new_p_l)
+            return new_p_l
+
+        for line in line_list:
+            print('one line', "="*20)
+            path = canvas.beginPath()
+            Abbr = line.get("AbbreviatedData").split(" ")  # AbbreviatedData
+            color = line.get("FillColor", [0, 0, 0])
+
+            relu_list = match_mode(Abbr)
+            # TODO combine relu_list: 1) M L straight line 2) M B*n cubic Bezier 3) M Q*n quadratic Bezier
+
+            # print(relu_list)
+
+            acticons = assemble(relu_list)
+            pos = line.get("pos")
+            # print(color)
+            if len(color) < 3:
+                color = [0, 0, 0]
+            canvas.setStrokeColorRGB(*(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255))  # stroke colour
+
+            # set the line width
+            try:
+                LineWidth = (float(line.get("LineWidth", "0.25").replace(" ", "")) if \
+                                 line.get("LineWidth", "0.25").replace(" ", "") else 0.25) * self.OP
+            except Exception as e:
+                logger.error(f"{e}")
+                LineWidth = 0.25 * self.OP
+
+            canvas.setLineWidth(LineWidth)  # unit is points; 2 means 2 pt
+            cur_point = []
+            for acticon in acticons:
+                if acticon.get("end_point").get("mode") == 'M':
+                    x, y = acticon.get("end_point").get("points")
+                    x = convert_coord([x], "x", page_size, pos)[0]
+                    y = convert_coord([y], "y", page_size, pos)[0]
+                    cur_point = [x, y]
+                    path.moveTo(x, y)
+
+                elif acticon.get("end_point").get("mode") == 'L':  # straight line
+                    x, y = acticon.get("end_point").get("points")
+                    print('path L x, y', x, y)
+                    x = convert_coord([x], "x", page_size, pos)[0]
+                    y = convert_coord([y], "y", page_size, pos)[0]
+                    print('path L x, y2', x, y)
+                    path.lineTo(x, y)
+
+
+                elif acticon.get("end_point").get("mode") == 'B':  # cubic Bezier
+                    x1, y1, x2, y2, x3, y3 = acticon.get("end_point").get("points")
+                    # print(x1, y1, x2, y2, x3, y3)
+                    x1, x2, x3 = convert_coord([x1, x2, x3], "x", page_size, pos)
+                    y1, y2, y3 = convert_coord([y1, y2, y3], "y", page_size, pos)
+                    cur_point = [x2, y2]
+                    path.curveTo(x1, y1, x2, y2, x3, y3)
+                    path.moveTo(x3, y3)
+
+                elif acticon.get("end_point").get("mode") == 'Q':  # quadratic Bezier
+                    x1, y1, x2, y2 = acticon.get("end_point").get("points")
+                    x1, x2 = convert_coord([x1, x2], "x", page_size, pos)
+                    y1, y2 = convert_coord([y1, y2], "y", page_size, pos)
+                    cur_point = [x2, y2]
+                    path.curveTo(x1, y1, x2, y2, x2, y2)
+                    path.moveTo(x2, y2)
+                elif acticon.get("end_point").get("mode") == 'A':  # arc
+                    x1, y1 = acticon.get("start_point").get("points")
+                    rx, ry, startAng, large_arc_flag, sweep_flag, x2, y2 = acticon.get("end_point").get("points")
+                    rx_o = rx
+                    ry_o = ry
+
+                    x1, x2, rx = convert_coord([x1, x2, rx], "x", page_size, pos)
+                    y1, y2, ry = convert_coord([y1, y2, ry], "y", page_size, pos)
+
+                    cur_x, cur_y = cur_point
+
+                    # draw arc — known to be wrong
+                    if rx_o == ry_o:
+                        # path.circle(cur_x,cur_y, 20) # circle
+                        path.circle(rx, ry, 20)  # circle (hard-coded radius; produces a spurious circle)
+                    else:
+                        print(rx, ry, x2, y2, startAng, large_arc_flag, sweep_flag)
+                        path.ellipse(rx, ry, 20, 20, )  # ellipse
+                    # path.arc(rx, ry, x2, y2, startAng=int(startAng), extent=int(sweep_flag))
+                    # path.ellipse(rx, ry,x2, y2, ) # ellipse
+                    # path.curveTo(rx, ry ,x2, y2, startAng=int(startAng), extent=int(sweep_flag))
+                    path.moveTo(x2, y2)
+                    cur_point = [x2, y2]
+
+                elif acticon.get("end_point").get("mode") == 'C':
+                    # canvas.drawPath(path)
+                    path.close()
+            canvas.drawPath(path)
+
+    def draw_line(self, canvas, line_list, page_size, pdf_page_size):
+        def match_mode(Abbr: list):
+            """
+            解析AbbreviatedData
+            匹配各种线条模式
+            S 定义起始 坐标 x, y
+            M 移动到指定坐标 x, y
+            L 从当前点移动到指定点 x, y
+            Q x1 y1 x2 y2 二次贝塞尔曲线 从当前点连接一条到点(x2,y2)的二次贝塞尔曲线,并将当前点移动到点(x2,y2),此贝塞尔曲线使用点(x1,y1)作为其控制点。
+            B x1 y1 x2 y2 x3 y3 三次贝塞尔曲线 从当前点连接一条到点(x3,y3)的三次贝塞尔曲线,并将当前点移动到点(x3,y3),此贝塞尔曲线使用点(x1,y1)和点(x2,y2)作为其控制点。
+            A Are 操作数为rx ry angle large sweep x y,从当前点连接一条到点(x,y)的圆弧,并将当前点移动到点(x,y)。
+            其中,rx表示椭圆的长轴长度,ry表示椭圆的短轴长度,angle表示椭圆在当前坐标系下旋转的角度,正值为顺时针,
+            负值为逆时针,large为 1 时表示对应度数大于 180° 的弧,为 0 时表示对应度数小于 180° 的弧,
+            sweep为 1 时表示由圆弧起始点到结束点是顺时针旋转,为 0 时表示由圆弧起始点到结束点是逆时针旋转。
+            C 无操作数,其作用是SubPath自动闭合,表示将当前点和SubPath的起始点用线段直接连接。
+            """
+            relu_list = []
+            mode = ""
+            modes = ["S", "M", "L", "Q", "B", "A", "C"]
+            mode_dict = {}
+            for idx, i in enumerate(Abbr):
+                if i in modes:
+                    mode = i
+                    if mode_dict:
+                        relu_list.append(mode_dict)
+                    mode_dict = {"mode": i, "points": []}
+
+                else:
+                    mode_dict["points"].append(i)
+
+                if idx + 1 == len(Abbr):
+                    relu_list.append(mode_dict)
+            return relu_list
+
+        def assemble(relu_list: list):
+            start_point = {}
+            acticon = []
+
+            for i in relu_list:
+                if i.get("mode") == "M":
+                    if not start_point:
+                        start_point = i
+                    acticon.append({
+                        "start_point": start_point, "end_point": i})
+
+                elif i.get("mode") in ['B', "Q", 'L']:
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "C":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "A":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "S":
+                    start_point = i
+
+            return acticon
+
+        for line in line_list:
+            # print('one line', "="*20)
+            path = canvas.beginPath()
+            abbr = line.get("AbbreviatedData").split(" ")
+            color = line.get("FillColor", [0, 0, 0])
+
+            # 线条解析
+            relu_list = match_mode(abbr)
+            actions = assemble(relu_list)
+
+            # 变换矩阵
+            ctm = line.get("CTM", '')
+            ctm = self.get_ctm(ctm)
+
+            # 文本框范围
+            boundary = line.get("pos")
+            if len(boundary) != 4:
+                print('draw_line not boundary', boundary)
+                return
+
+            # 设置颜色
+            if len(color) < 3:
+                color = [0, 0, 0]
+            canvas.setStrokeColorRGB(*(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255))  # 颜色
+
+            # 设置线条宽度
+            line_w = 0.20 * self.OP
+            canvas.setLineWidth(line_w)
+
+            for action in actions:
+                if action.get("end_point").get("mode") == 'M':
+                    x, y = action.get("end_point").get("points")
+                    # print('path M x, y', x, y)
+                    x, y = self.get_actural_p(x, y, ctm, boundary)
+                    x = x * self.OP
+                    y = pdf_page_size[3] - y * self.OP
+                    # print('path M x, y2', x, y)
+                    path.moveTo(x, y)
+
+                elif action.get("end_point").get("mode") == 'L':  # 直线
+                    x, y = action.get("end_point").get("points")
+                    # print('path L x, y', x, y)
+                    x, y = self.get_actural_p(x, y, ctm, boundary)
+                    # print('path L x, y1', x, y)
+                    x = x * self.OP
+                    y = pdf_page_size[3] - y * self.OP
+                    # print('path L x, y2', x, y)
+                    path.lineTo(x, y)
+
+                elif action.get("end_point").get("mode") == 'C':
+                    path.close()
+            canvas.drawPath(path)
+
+    def get_actural_p(self, x, y, ctm, boundary):
+        x, y = float(x), float(y)
+        a, b, c, d, e, f = ctm
+        left, bottom, width, height = boundary
+        # print('left, x, a', left, x, a, type(left), type(x), type(a))
+        x = left + x * a
+        y2 = bottom + y * d
+        y1 = y2 + height
+        return x, y2
+
+    def draw_pdf(self):
+        """Render every parsed OFD document/page onto a reportlab canvas.
+
+        Returns a dict mapping page index -> True when the page carries both
+        images and annotation text (the caller rasterizes such pages instead).
+        NOTE(review): the dict is keyed by the per-document page index ``pi``,
+        so with multiple documents later docs overwrite earlier entries —
+        confirm multi-document OFDs are handled as intended.
+        """
+        c = canvas.Canvas(self.pdf_io)
+        c.setAuthor(self.author)
+        page_need_to_image_dict = {}
+        for doc_id, doc in enumerate(self.data, start=0):
+            # print(1)
+            fonts = doc.get("fonts")
+            images = doc.get("images")
+            default_page_size = doc.get("default_page_size")
+            page_size_details = doc.get("page_size")
+            # print("page_size_details", page_size_details)
+            signatures_page_id = doc.get("signatures_page_id")  # signature info
+            # annot_page_info = doc.get("annot_page_info")
+
+            # (disabled) register embedded fonts
+            # for font_id, font_v in fonts.items():
+            #     file_name = font_v.get("FontFile")
+            #     font_b64 = font_v.get("font_b64")
+            #     if font_b64:
+            #         self.font_tool.register_font(os.path.split(file_name)[1], font_v.get("@FontName"), font_b64)
+
+            # skip docs whose page count does not match the collected page sizes
+            if len(doc.get("page_info")) != len(page_size_details):
+                print('len(doc.get("page_info")) != len(page_size_details)')
+                continue
+
+            page_id_list = list(doc.get("page_info").keys())
+            try:
+                page_id_list.sort(key=lambda x: int(x))
+            except:
+                traceback.print_exc()
+                print('sort page_id_list error!', page_id_list)
+                continue
+
+            # text_img_idwrite = []
+            # print("doc.get(page_info)", len(doc.get("page_info")))
+            for pi, page_id in enumerate(page_id_list):
+                page = doc.get("page_info").get(page_id)
+                annot_text_list = doc.get("page_info").get(page_id).get('annot_text_list')
+                # print('page111', page)
+                # print(f"page_id: {page_id} page_size_details: {page_size_details}")
+                # if len(page_size_details) > page_id and page_size_details[page_id]:
+                #     page_size = page_size_details[page_id]
+                # else:
+                #     page_size = default_page_size
+                page_size = page_size_details[pi]
+                # logger.info(f"page_id {page_id} page_size {page_size}")
+                text_list = page.get("text_list")
+                img_list = page.get("img_list")
+                line_list = page.get("line_list")
+                # print("img_list",img_list)
+                # print('page_size222', page_size)
+                c.setPageSize((page_size[2] * self.OP, page_size[3] * self.OP))
+                pdf_page_size = [x * self.OP for x in page_size]
+
+                # print('len(img_list), len(images), len(text_list), len(line_list)', len(img_list), len(images), len(text_list), len(line_list))
+
+                # draw images
+                # print('annot_text_list', annot_text_list)
+                # if img_list and annot_text_list:
+                #     annot_page_size = doc.get("page_info").get(page_id).get('annot_page_size')
+                #     print('annot_page_size111', annot_page_size)
+                #     self.draw_img_with_annot(c, img_list, images, annot_page_size, pdf_page_size, self.OP, annot_text_list)
+
+                # mark pages that carry both images and annotation text
+                if img_list and annot_text_list:
+                    page_need_to_image_dict[pi] = True
+                else:
+                    page_need_to_image_dict[pi] = False
+                if img_list:
+                    self.draw_img(c, img_list, images, page_size, pdf_page_size, self.OP)
+
+                # draw text
+                if text_list:
+                    # normalize special glyphs (e.g. Kangxi radicals) to basic Chinese
+                    for line_dict in text_list:
+                        text = line_dict.get("text")
+                        line_dict['text'] = special_font_to_normal(text)
+                        # print('draw_chars, text', text, line_dict.get('pos'))
+                    self.draw_chars(c, text_list, fonts, page_size, pdf_page_size)
+
+                # draw line paths
+                if line_list:
+                    # for line in line_list:
+                    #     print('line', line)
+                    self.draw_line(c, line_list, page_size, pdf_page_size)
+
+                # (disabled) draw signature seals
+                # if signatures_page_id:
+                #     self.draw_signature(c, signatures_page_id.get(page_id), page_size)
+
+                # print("去写入")
+                # print(doc_id,len(self.data))
+                # (old) page-break logic kept for reference
+                # if page_id != len(doc.get("page_info")) - 1 and doc_id != len(self.data):
+                #     # print("写入")
+                #     c.showPage()
+                    # json.dump(text_write,open("text_write.json","w",encoding="utf-8"),ensure_ascii=False)
+                c.showPage()
+        c.save()
+        return page_need_to_image_dict
+
+    def __call__(self):
+        """Render the parsed OFD to PDF and return the PDF bytes.
+
+        On any rendering error an empty PDF is generated instead, so callers
+        always receive valid PDF bytes; ``self.page_need_to_image_dict`` then
+        resets to {} (otherwise it is whatever draw_pdf returned).
+        """
+        try:
+            page_need_to_image_dict = self.draw_pdf()
+            self.page_need_to_image_dict = page_need_to_image_dict
+            pdfbytes = self.pdf_io.getvalue()
+        except Exception as e:
+            logger.error(f"{e}")
+            logger.error(f"ofd解析失败")
+            traceback.print_exc()
+            # fall back to an empty document so downstream code still gets valid PDF
+            self.gen_empty_pdf()
+            self.page_need_to_image_dict = {}
+            pdfbytes = self.pdf_io.getvalue()
+        return pdfbytes
+
+
+

+ 113 - 0
format_convert/easyofd/easyofd/draw/find_seal_img.py

@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd read_seal_img
+# CREATE_TIME: 2024/5/28 14:13
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note: 根据 ASN.1 解析签章 拿到 签章图片
+import io
+import base64
+
+from PIL import Image, UnidentifiedImageError
+from loguru import logger
+from pyasn1.codec.der.decoder import decode
+from pyasn1.type import univ
+from pyasn1.error import PyAsn1Error
+
+
+
+class SealExtract(object):
+    def __init__(self,):
+        pass
+    def read_signed_value(self, path="", b64=""):
+        # 读取二进制文件
+        if b64:
+            binary_data = base64.b64decode(b64)
+        elif path:
+            # print("seal_path",path)
+            with open(path, 'rb') as file:
+                binary_data = file.read()
+        else:
+            return
+        # 尝试解码为通用的 ASN.1 结构
+        try:
+            decoded_data, _ = decode(binary_data)
+        except (PyAsn1Error,) as e:
+            logger.warning(f"Decoding failed: {e}")
+            decoded_data = None
+        except (AttributeError,) as e:
+            logger.warning(f"AttributeError failed: {e}")
+            decoded_data = None
+        finally:
+           return  decoded_data
+
+
+    def find_octet_strings(self, asn1_data,octet_strings:list):
+
+        # 递归查找所有的 OctetString 实例
+
+        if isinstance(asn1_data, univ.OctetString):
+
+            octet_strings.append(asn1_data)
+        elif isinstance(asn1_data, univ.Sequence) or isinstance(asn1_data, univ.Set):
+            for component in asn1_data:
+                self.find_octet_strings(asn1_data[f"{component}"], octet_strings)
+        elif isinstance(asn1_data, univ.Choice):
+            self.find_octet_strings(asn1_data.getComponent(), octet_strings)
+        elif isinstance(asn1_data, univ.Any):
+            try:
+                sub_data, _ = decode(asn1_data.asOctets())
+                self.find_octet_strings(sub_data, octet_strings)
+            except PyAsn1Error:
+                pass
+
+
+    def hex_to_image(self, hex_data, image_format='PNG',inx=0):
+        """
+        将16进制数据转换为图片并保存。
+
+        :param hex_data: 图片的16进制数据字符串
+        :param image_format: 图片的格式,默认为'PNG'
+        """
+        # 将16进制数据转换为二进制数据
+
+        binary_data = bytes.fromhex(hex_data)
+
+        # 创建BytesIO对象以读取二进制数据
+        image_stream = io.BytesIO(binary_data)
+
+        # 使用Pillow打开图像数据并保存
+        try:
+            image = Image.open(image_stream)
+            # image.save(f'{inx}_image.{image_format}', format=image_format)
+            # print(f"图片已保存为'image.{image_format}'")
+            return image
+        except UnidentifiedImageError:
+            # logger.info("not img ")
+            pass
+
+    def __call__(self, path="", b64=""):
+
+        decoded_data = self.read_signed_value(path=path, b64=b64)
+        octet_strings = []
+        img_list = []  # 目前是只有一个的,若存在多个的话关联后面考虑
+        if decoded_data:
+            self.find_octet_strings(decoded_data, octet_strings)
+
+            for i, octet_string in enumerate(octet_strings):
+                # logger.info(f"octet_string{octet_string}")
+                if str(octet_string.prettyPrint()).startswith("0x"):
+
+                    img = self.hex_to_image(str(octet_string.prettyPrint())[2:],inx= i)
+                    if img:
+                        # logger.info("ASN.1 data found.")
+                        img_list.append(img)
+        else:
+            pass
+            # logger.info("No valid ASN.1 data found.")
+
+        return  img_list
+
+if __name__=="__main__":
+    print(SealExtract()(r"F:\code\easyofd\test\1111_xml\Doc_0\Signs\Sign_0\SignedValue.dat" ))
+

+ 216 - 0
format_convert/easyofd/easyofd/draw/font_tools.py

@@ -0,0 +1,216 @@
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+#PROJECT_NAME: D:\code\easyofd\easyofd
+#CREATE_TIME: 2023-07-27 
+#E_MAIL: renoyuan@foxmail.com
+#AUTHOR: reno 
+#NOTE: 字体处理
+import sys
+import time
+import re
+import json
+import base64
+import zipfile
+import os
+import shutil
+import logging
+from io import BytesIO, StringIO
+import string
+from uuid import uuid1
+import random
+import traceback
+import logging
+
+
+import tempfile
+import xmltodict
+from fontTools.ttLib import TTFont as ttLib_TTFont
+from fontTools.pens.basePen import BasePen
+from reportlab.graphics.shapes import Path
+from reportlab.lib import colors
+from reportlab.graphics import renderPM
+from reportlab.graphics.shapes import Group, Drawing, scale
+from reportlab import platypus
+from reportlab.lib.pagesizes import letter, A4
+from reportlab.lib.units import mm,inch
+from reportlab.platypus import SimpleDocTemplate, Image
+from reportlab.lib.utils import ImageReader
+from reportlab.pdfgen import canvas
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.cidfonts import UnicodeCIDFont
+from reportlab.pdfbase.ttfonts import TTFont
+from concurrent.futures import ThreadPoolExecutor
+import threading
+import multiprocessing
+import PIL
+
+
+from reportlab.lib.fonts import _tt2ps_map 
+from reportlab.lib.fonts import _family_alias
+
+
+sys.path.append(os.path.dirname(__file__) + "/../../../../")
+
+from format_convert.easyofd.easyofd.draw import FONTS
+
+from loguru import logger
+
+
+
+class FontTool(object):
+    """Discover system-installed fonts and register OFD-embedded fonts with reportlab."""
+    FONTS = FONTS
+    def __init__(self):
+        # initially supported fonts / font detection
+        # logger.debug("FontTool init ,read system default Font ... ")
+        self.FONTS = self.get_installed_fonts()
+        # logger.debug(f"system default Font is \n{self.FONTS} \n{'-'*50}")
+
+
+    def get_system_font_dirs(self,):
+        """Return the font directories for the current operating system."""
+        system = os.name
+        if system == 'nt':  # Windows
+            return [os.path.join(os.environ['WINDIR'], 'Fonts')]
+        elif system == 'posix':  # Linux/macOS
+            return [
+                '/usr/share/fonts',
+                '/usr/local/share/fonts',
+                os.path.expanduser('~/.fonts'),
+                os.path.expanduser('~/.local/share/fonts'),
+                '/Library/Fonts',  # macOS
+                '/System/Library/Fonts'  # macOS
+            ]
+        else:
+            return []
+
+    def normalize_font_name(self, font_name):
+        """Normalize a font name, e.g. 'Times New Roman Bold' -> 'TimesNewRoman-Bold'."""
+        # drop spaces, then join style suffixes (Bold/Italic/...) with a hyphen
+        normalized = font_name.replace(' ', '')
+        # handle common style suffixes
+        for style in ['Bold', 'Italic', 'Regular', 'Light', 'Medium', ]:
+            if style in normalized:
+                normalized = normalized.replace(style, f'-{style}')
+
+        # TODO: special font-name mappings; extend as new cases appear
+        if normalized ==  "TimesNewRoman" :
+            normalized = normalized.replace("TimesNewRoman","Times-Roman")
+        return normalized
+
+    def _process_ttc_font(self, ttc_font):
+        """Collect plausible font names from every name record of a .ttc font."""
+        def judge_name(name):
+            # discard URLs and over-long strings that are not real font names
+            if 'http://' in name or 'https://' in name or len(name) > 50:
+                return False
+            else:
+                return True
+        font_names = set()
+        try:
+            # fetch all available name records
+            name_records = ttc_font['name'].names
+
+            for idx, record in enumerate(name_records):
+                try:
+                    # prefer the Simplified-Chinese name (language ID 2052)
+                    if record.platformID == 3 and record.langID == 2052:
+                        cn_name = record.toUnicode()
+                        if judge_name(cn_name):
+                            font_names.add(cn_name)
+
+
+
+                    # fall back to the English name (language ID 1033)
+                    elif record.platformID == 3 and record.langID == 1033:
+                        name = record.toUnicode()
+                        if judge_name(name):
+                            font_names.add(name)
+                except:
+                    continue
+        except KeyError:
+            # skip fonts without a 'name' table
+            pass
+        return font_names
+    def get_installed_fonts(self, ):
+        """Return names/families of all installed fonts ('宋体' moved to front if present)."""
+        font_dirs = self.get_system_font_dirs()
+        installed_fonts = set()
+        for font_dir in font_dirs:
+            if not os.path.isdir(font_dir):
+                continue
+            for root, _, files in os.walk(font_dir):
+                for file in files:
+                    if file.lower().endswith(('.ttf', '.otf','.ttc')):
+                        font_path = os.path.join(root, file)
+
+                        try:
+                            if file.lower().endswith('.ttc'):
+                                # for .ttc collections, read the contained names
+                                ttc_font = ttLib_TTFont(font_path, fontNumber=0)  # read the first font
+                                installed_fonts.update(self._process_ttc_font(ttc_font))
+                            else:
+                                with ttLib_TTFont(font_path) as font:
+                                    # extract the font's full name and family name
+                                    name_cn = font['name'].getName(4, 3, 1, 2052)
+                                    if name_cn:
+                                        installed_fonts.add(name_cn.toUnicode())
+                                    # 4=Full Name, 3=Windows, 1=Unicode
+                                    name = font['name'].getName(4, 3, 1, 1033)
+                                    if name:
+                                        installed_fonts.add(name.toUnicode())
+                                    family_cn = font['name'].getName(1, 3, 1, 2052)
+                                    if family_cn:
+                                        installed_fonts.add(family_cn.toUnicode())
+                                    family = font['name'].getName(1, 3, 1, 1033)
+                                    if family:  # 1=Family Name
+                                        installed_fonts.add(family.toUnicode())
+                        except Exception as e:
+                            print(f"解析字体 {font_path} 失败: {e}")
+        installed_fonts = list(installed_fonts)
+        if "宋体" in installed_fonts:
+            installed_fonts.remove("宋体")
+            installed_fonts.insert(0, "宋体")
+        return installed_fonts
+
+    def is_font_available(self, target_font):
+        """Return True if *target_font* is installed (rescans the font dirs each call)."""
+        installed_fonts = self.get_installed_fonts()
+        return target_font in installed_fonts
+
+
+    def font_check(self):
+        # currently a no-op; the registration checks below are kept for reference
+        pass
+        # logger.info("f{_tt2ps_map}")
+        # logger.info("f{_family_alias}")
+
+        # for font in self.FONTS:
+        #     if font in _tt2ps_map.values():
+        #         logger.info(f"已注册{font}")
+        #     else:
+        #         logger.warning(f"-{font}-未注册可能导致写入失败")
+
+
+
+    def register_font(self,file_name,FontName,font_b64):
+        """Register a base64-encoded embedded font with reportlab.
+
+        Writes the font bytes to a file in the current working directory,
+        registers it via pdfmetrics, then removes the file.
+        NOTE(review): writing into CWD under the embedded file name can
+        collide between concurrent conversions — a tempfile would be safer.
+        """
+        if font_b64:
+            
+            file_name = os.path.split(file_name)
+            # logger.error(f"file_name:{file_name}")
+            # logger.info(f"file_name:{file_name}")
+            if isinstance(file_name, (tuple, list)):
+                    file_name = file_name[1]
+            if not FontName:
+                FontName = file_name.split(".")[0]
+
+            try:
+                with open(file_name, "wb") as f:
+                    f.write(base64.b64decode(font_b64))
+                # print("FontName", FontName, "file_name", file_name)
+                pdfmetrics.registerFont(TTFont(FontName, file_name))
+                self.FONTS.append(FontName)
+            except Exception as e:
+                logger.error(f"register_font_error:\n{e} \n 包含不支持解析字体格式")
+            finally:
+                if os.path.exists(file_name):
+                    os.remove(file_name)

+ 666 - 0
format_convert/easyofd/easyofd/draw/ofdtemplate.py

@@ -0,0 +1,666 @@
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+#PROJECT_NAME: F:\code\easyofd\easyofd\draw
+#CREATE_TIME: 2023-10-30 
+#E_MAIL: renoyuan@foxmail.com
+#AUTHOR: reno 
+#note:  ofd 基础结构模板
+import tempfile
+import os
+import abc
+import copy
+
+from loguru import logger
+import xmltodict
+import zipfile
+
+__all__ = ["CurId", "OFDTemplate", "DocumentTemplate", "DocumentResTemplate",
+           "PublicResTemplate", "ContentTemplate", "OFDStructure"]
+"""
+OFD目录结构
+    │  OFD.xml
+    │  
+    └─Doc_0
+        │  Document.xml
+        │  DocumentRes.xml
+        │  PublicRes.xml
+        │  
+        ├─Annots
+        │  │  Annotations.xml
+        │  │  
+        │  └─Page_0
+        │          Annotation.xml
+        │          
+        ├─Attachs
+        │      Attachments.xml
+        │      original_invoice.xml
+        │      
+        ├─Pages
+        │  └─Page_0
+        │          Content.xml
+        │          
+        ├─Res
+        │      image_80.jb2
+        │      
+        ├─Signs
+        │  │  Signatures.xml
+        │  │  
+        │  └─Sign_0
+        │          Signature.xml
+        │          SignedValue.dat
+        │          
+        ├─Tags
+        │      CustomTag.xml
+        │      CustomTags.xml
+        │      
+        └─Tpls
+            └─Tpl_0
+                    Content.xml
+"""
+class CurId(object):
+    """Document-wide numeric ID allocator.
+
+    OFD objects must carry unique numeric "@ID" values; one CurId instance is
+    shared by all templates of a document so IDs never collide.
+    """
+    def __init__(self):
+        self.id = 1          # current/next numeric object ID
+        self.used = False    # False until the first ID has been handed out
+        self.uuid_map = {}  # uuid -> assigned ID; filled while resource IDs are generated so page construction can resolve ResourceID later
+
+    def add_uuid_map(self, k, v):
+        """Record the mapping from resource uuid *k* to its assigned ID *v*."""
+        # logger.debug(f"uuid_map add {k}: {v}")
+        self.uuid_map[k] = v
+    def add(self):
+        """Advance to the next ID."""
+        self.id += 1
+
+    def get_id(self):
+        """Return a fresh unique ID (the very first call returns 1 without advancing)."""
+        if self.used:
+            self.add()
+            return self.id
+        if not self.used:
+            cur_id = self.id
+            self.used =True
+            return cur_id
+
+    def get_max_id(self):
+        """Return MaxUnitID for Document.xml: one past the last ID handed out."""
+        MaxUnitID = self.id + 1
+        return MaxUnitID
+
+class TemplateBase(object):
+    """Base class for all OFD XML templates.
+
+    Subclasses provide ``ofdjson`` (an xmltodict-style dict skeleton) plus
+    ``key_map``/``id_keys``; ``assemble`` merges caller values in and assigns IDs.
+    """
+    key_map = {}  # maps constructor kwarg names to their XML element names, e.g. DocID -> ofd:DocID
+    id_keys = [ ]  # element names that must receive an "@ID" attribute
+    template_name = ""
+    def __init__(self,*args,**kwargs):
+        # print(args)
+        # print(kwargs)
+        self.id_obj: CurId = kwargs.get("id_obj")
+        # print("id_obj", self.id_obj)
+        self.assemble(*args, **kwargs)
+
+
+    def assemble(self,*args, **kwargs):
+        """Deep-copy the ``ofdjson`` skeleton, merge kwarg values in, then assign IDs."""
+
+        self.final_json = copy.deepcopy(self.ofdjson)
+
+        # merge caller-supplied element values into the template
+        if kwargs:
+            for k, v in kwargs.items():
+                if k in self.key_map:
+                    self.modify(self.final_json, self.key_map[k], v)
+
+        # assign "@ID" attributes
+        for id_key in self.id_keys:
+            print(f"开始gen_id >> {self.template_name}>>{id_key}")
+            # print(f"final_json {self.final_json}")
+            self.gen_id(self.final_json, id_key)
+
+    def gen_id(self,ofdjson, id_key):
+        """Recursively assign a fresh "@ID" to every *id_key* element in *ofdjson*."""
+        # print("id_key ", id_key, "ofdjson ", ofdjson)
+
+        for k, v in ofdjson.items():
+            if k == id_key:
+                # assign the ID(s)
+                if isinstance(ofdjson[k], dict):
+                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
+
+                    # logger.info(f"id assigned -> {ofdjson[k]}")
+                elif isinstance(ofdjson[k], list):
+                    for i in ofdjson[k]:
+                        i["@ID"] = f"{self.id_obj.get_id()}"
+
+                        # logger.info(f"id assigned ->i {i}")
+
+            elif isinstance(v, dict):
+                # logger.debug(f"dict_v{v}")
+                self.gen_id(v, id_key)
+
+
+            elif isinstance(v, list):
+                for v_cell in v:
+                    if isinstance(v_cell, dict):
+                        # logger.debug(f"dict_v{v}")
+                        self.gen_id(v_cell, id_key)
+
+                    
+    def modify(self, ofdjson, key, value):
+        """Recursively set every occurrence of *key* to *value* (all matches are changed)."""
+        
+        for k, v in ofdjson.items():
+            if k == key:
+                ofdjson[k] = value
+            elif isinstance(v, dict):
+                self.modify(v, key, value)
+            elif isinstance(v, list):
+                for v_cell in v:
+                    if isinstance(v_cell, dict):
+                        self.modify(v_cell, key, value)
+    
+    def save(self, path):
+        """Serialize ``final_json`` back to pretty-printed XML at *path* (UTF-8)."""
+        xml_data = xmltodict.unparse(self.final_json, pretty=True)
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(xml_data)
+
+class OFDTemplate(TemplateBase):
+    """Globally unique root node OFD.xml — entry point referencing Doc_0/Document.xml."""
+    template_name = "OFD"
+    key_map = {"Author": "ofd:Author", "DocID": "ofd:DocID"  ,"CreationDate": "ofd:CreationDate"
+    }
+
+    ofdjson = {
+
+        "ofd:OFD": {
+            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
+            "@Version": "1.1",
+            "@DocType": "OFD",
+            "ofd:DocBody": [{
+                "ofd:DocInfo": {
+                    "ofd:DocID": "0C1D4F7159954EEEDE517F7285E84DC4",
+                    "ofd:Creator": "easyofd",
+                    "ofd:author": "renoyuan",
+                    "ofd:authoremail": "renoyuan@foxmail.com",
+                    "ofd:CreatorVersion": "1.0",
+                    "ofd:CreationDate": "2023-10-27"
+                },
+                "ofd:DocRoot": "Doc_0/Document.xml"
+            }]
+        }
+    }
+
+class DocumentTemplate(TemplateBase):
+    """Unique per Doc: describes the Doc's internal structure (Document.xml)."""
+    template_name = "Document"
+    key_map = {"Page": "ofd:Page","PhysicalBox":"ofd:PhysicalBox"}
+    id_keys = ["ofd:Page"]
+    ofdjson ={
+    "ofd:Document": {
+        "@xmlns:ofd": "http://blog.yuanhaiying.cn",
+        "ofd:CommonData": {
+            "ofd:MaxUnitID": 0,
+            "ofd:PageArea": {
+                "ofd:PhysicalBox": "0 0 140 90"
+            },
+            "ofd:PublicRes": "PublicRes.xml",
+            "ofd:DocumentRes": "DocumentRes.xml"
+        },
+        "ofd:Pages":
+            {
+            "ofd:Page": [{
+                "@ID": 0,
+                "@BaseLoc": "Pages/Page_0/Content.xml"
+            }]
+        }
+    }
+}
+
+    def update_max_unit_id(self, final_json=None):
+        """Recursively locate ofd:MaxUnitID and refresh it from the shared ID allocator.
+
+        Must be called after all other templates have drawn their IDs, so the
+        value really is the document-wide maximum.
+        """
+        if not final_json:
+            final_json = self.final_json
+
+        for k, v in final_json.items():
+            if k == "ofd:MaxUnitID":
+                final_json["ofd:MaxUnitID"]=self.id_obj.get_max_id()
+                return
+
+            elif isinstance(v, dict):
+                self.update_max_unit_id(v)
+            elif isinstance(v, list):
+                for v_cell in v:
+                    if isinstance(v_cell, dict):
+                        self.update_max_unit_id(v_cell)
+
+    def update_page(self,page_num):
+        """Placeholder: intended to sync the page list to *page_num* pages (unimplemented)."""
+        pass
+
+class DocumentResTemplate(TemplateBase):
+    """Unique per Doc: MultiMedia resources such as images (DocumentRes.xml)."""
+    template_name = "DocumentRes"
+    key_map = {"MultiMedia": "ofd:MultiMedia"}
+    id_keys = ["ofd:DrawParam", "ofd:MultiMedia"]
+    ofdjson = {
+        "ofd:Res": {
+            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
+            "@BaseLoc": "Res",
+            "ofd:MultiMedias": {
+                "ofd:MultiMedia": [
+                    {
+                        "@ID": 0,
+                        "@Type": "Image",
+                        "ofd:MediaFile": "Image_2.jpg"
+                    }
+                ]
+            }
+        }
+    }
+    def gen_id(self,ofdjson, id_key):
+        """Assign IDs like the base class, additionally recording res_uuid -> ID
+        in the shared allocator so pages can later resolve @ResourceID."""
+        # print("id_key ", id_key, "ofdjson ", ofdjson)
+
+        for k, v in ofdjson.items():
+            if k == id_key:
+                # assign the ID(s)
+                if isinstance(ofdjson[k], dict):
+                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
+
+                    res_uuid = ofdjson[k].get("res_uuid")
+                    if res_uuid:
+                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
+                    # logger.info(f"id assigned -> {ofdjson[k]}")
+                elif isinstance(ofdjson[k], list):
+                    for i in ofdjson[k]:
+
+                        i["@ID"] = f"{self.id_obj.get_id()}"
+                        res_uuid = i.get("res_uuid")
+                        if res_uuid:
+                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])
+                        # logger.info(f"id assigned ->i {i}")
+
+            elif isinstance(v, dict):
+                # logger.debug(f"dict_v{v}")
+                self.gen_id(v, id_key)
+
+
+            elif isinstance(v, list):
+                for v_cell in v:
+                    if isinstance(v_cell, dict):
+                        # logger.debug(f"dict_v{v}")
+                        self.gen_id(v_cell, id_key)
+
+class PublicResTemplate(TemplateBase):
+    """DOC 内唯一 公共配置资源信息 如 Font  Color 等 PublicRes.xml"""
+    template_name = "PulicRes"
+    key_map = {"Font": "ofd:Font"}
+    id_keys = ["ofd:ColorSpace", "ofd:Font"]
+    ofdjson = {
+        "ofd:Res": {
+            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
+            "@BaseLoc": "Res",
+            "ofd:ColorSpaces": {
+                "ofd:ColorSpace": {
+                    "@ID": 0,
+                    "@Type": "RGB",
+                    "@BitsPerComponent": "8",
+                    "#text":""
+                }
+            },
+            "ofd:Fonts": {
+                "ofd:Font": [
+                {
+                    "@ID": 0,
+                    "@FontName": "宋体",
+                    "@FamilyName": "宋体",
+
+                }
+            ]
+            }
+        }
+    }
+    def gen_id(self,ofdjson, id_key):
+        """生成id"""
+        # print("id_key ", id_key, "ofdjson ", ofdjson)
+
+        for k, v in ofdjson.items():
+            if k == id_key:
+                # 添加id
+                if isinstance(ofdjson[k], dict):
+                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
+                    res_uuid = ofdjson[k].get("res_uuid")
+                    if res_uuid:
+                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
+                    # logger.info(f"添加id -> {ofdjson[k]}")
+                elif isinstance(ofdjson[k], list):
+                    for i in ofdjson[k]:
+
+                        i["@ID"] = f"{self.id_obj.get_id()}"
+                        res_uuid = i.get("res_uuid")
+                        if res_uuid:
+                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])
+                        # logger.info(f"添加id ->i {i}")
+
+            elif isinstance(v, dict):
+                # logger.debug(f"dict_v{v}")
+                self.gen_id(v, id_key)
+
+
+            elif isinstance(v, list):
+                for v_cell in v:
+                    if isinstance(v_cell, dict):
+                        # logger.debug(f"dict_v{v}")
+                        self.gen_id(v_cell, id_key)
+
+'''
+    "ofd:Font": [
+
+    {
+        "@ID": 0,
+        "@FontName": "STSong",
+        "@FamilyName": "SimSun",
+        "@Serif": "true",
+        "@FixedWidth": "true",
+        "@Charset": "prc"
+    }
+            "ofd:Area": {
+            "ofd:PhysicalBox": "0 0 210 140"
+        },
+'''
+
+
+class ContentTemplate(TemplateBase):
+    """正文部分 Content.xml"""
+    #"@Type": "Body",
+    template_name = "Content"
+    key_map = {"ImageObject": "ofd:ImageObject",
+               "PathObject": "ofd:PathObject",
+               "TextObject": "ofd:TextObject",
+               "CGTransform": "ofd:CGTransform",
+               "PhysicalBox": "ofd:PhysicalBox",
+               }
+    id_keys = ["ofd:Layer", "ofd:TextObject", "ofd:PathObject", "ofd:Clips", "ofd:ImageObject"]
+    correlate_map = {"ofd:TextObject": "@Font",
+                     "ofd:ImageObject": "@ResourceID"
+
+                     }
+
+    ofdjson = {
+    "ofd:Page": {
+        "@xmlns:ofd": "http://blog.yuanhaiying.cn",
+
+        "ofd:Content": {
+            "ofd:PageArea": {
+                "ofd:PhysicalBox": "0 0 210 140"
+            },
+            "ofd:Layer":  {
+                "@ID": 0,
+                "@Type": "Foreground",
+
+
+                "ofd:TextObject": [{
+                        "@ID": 0,
+                        "@CTM": "7.054 0 0 7.054 0 134.026",
+                        "@Boundary": "69 7 72 7.6749",
+                        "@Font": "69",
+                        "@Size": "6.7028",
+                        "ofd:FillColor": {
+                            "@ColorSpace": "4",
+                            "@Value": "156 82 35"
+                        },
+                        "ofd:CGTransform": {
+                            "@CodePosition": "0",
+                            "@CodeCount": "10",
+                            "@GlyphCount": "10",
+                            "ofd:Glyphs": "18 10 11 42 60 53 24 11 42 61"
+                        },
+                        "ofd:TextCode": {
+                            "@X": "13.925",
+                            "@Y": "10",
+                            "@DeltaX": "7 7 7 7 7 7 7 7 7",
+                            "#text": "电⼦发票(普通发票)"
+                        }
+                    }],
+                "ofd:ImageObject": []
+                }
+        }}}
+    def __init__(self,*args,**kwargs):
+        # print(args)
+        # print(kwargs)
+        super().__init__(*args, **kwargs)
+        # 关联res_uuid
+        for key, targe_key in self.correlate_map.items():
+            self.correlate_res_uuid(self.final_json,key,targe_key)
+
+    def correlate_res_uuid(self, ofdjson,key,targe_key):
+        """correlate_res_uuid"""
+        print("========uuid_map", self.id_obj.uuid_map)
+        for k, v in ofdjson.items():
+            if k == key:
+                res_uuid = v_cell.pop("res_uuid", None)
+                if isinstance(v, dict) and res_uuid:
+
+                    v[targe_key] = self.id_obj.uuid_map[res_uuid]
+                    # logger.debug(f'{targe_key} >>> {v[targe_key]} -- {res_uuid}')
+                elif isinstance(v, list):
+                    for v_cell in v:
+                        res_uuid = None
+                        if isinstance(v_cell, dict):
+                            res_uuid = v_cell.pop("res_uuid", None)
+                        if isinstance(v_cell, dict) and res_uuid:
+
+                            v_cell[targe_key] = self.id_obj.uuid_map[res_uuid]
+                            # logger.debug(f'{targe_key} >>> {v_cell[targe_key]} -- {res_uuid}')
+                        else:
+                            pass
+                            # print(f"v_cell {v_cell}")
+                    pass
+                else:
+                    pass
+            elif isinstance(v, dict):
+                self.correlate_res_uuid(v, key, targe_key)
+            elif isinstance(v, list):
+                for v_cell in v:
+                    if isinstance(v_cell, dict):
+                        self.correlate_res_uuid(v_cell, key, targe_key)
+
+
+'''
+                "ofd:PathObject": [{
+                        "@ID": 0,
+                        "@CTM": "0.3527 0 0 -0.3527 0.35 141.43001",
+                        "@Boundary": "-0.35 -0.35 212.33 141.78999",
+                        "@LineWidth": "1",
+                        "@MiterLimit": "10",
+                        "@Stroke": "false",
+                        "@Fill": "true",
+                        "ofd:FillColor": {
+                            "@ColorSpace": "4",
+                            "@Value": "255 255 255"
+                        },
+                        "ofd:StrokeColor": {
+                            "@ColorSpace": "4",
+                            "@Value": "0 0 0"
+                        },
+                        "ofd:Clips": {
+                            "ofd:Clip": {
+                                "ofd:Area": {
+                                    "ofd:Path": {
+                                        "@ID": 0,
+                                        "@Boundary": "0.00766 -0.00763 600 400.00003",
+                                        "@Stroke": "false",
+                                        "@Fill": "true",
+                                        "ofd:AbbreviatedData": "M 0 0 L 600 0 L 600 400.00003 L 0 400.00003 C"
+                                    }
+                                }
+                            }
+                        },
+                        "ofd:AbbreviatedData": "M -1 401 L 601 401 L 601 -1 L -1 -1 C"
+                    },],
+                
+"ofd:ImageObject": [{
+                        "@ID": 0,
+                        "@CTM": "19.7512 0 0 19.7512 0 0",
+                        "@Boundary": "7.23035 7.40671 19.7512 19.7512",
+                        "@ResourceID": "104"
+                    }],
+'''
+
+class OFDStructure(object):
+    """OFD structure"""
+    def __init__(self, name, ofd=None, document=None,
+                 document_res=None, public_res=None,
+                  content_res:list=[], res_static: dict={}):
+        # 初始化的时候会先自动初始化 默认参数值
+        id_obj = CurId()
+        self.name = name
+        self.ofd = ofd if ofd else OFDTemplate(id_obj=id_obj)
+        self.document = document if document else DocumentTemplate(id_obj=id_obj)
+        self.document_res = document_res if document_res else  DocumentResTemplate(id_obj=id_obj)
+        self.public_res = public_res if public_res else PublicResTemplate(id_obj=id_obj)
+        self.content_res = content_res if content_res else [ContentTemplate(id_obj=id_obj)]
+        self.res_static = res_static
+       
+    def __call__(self, test=False):
+        """写入文件生成ofd"""
+        with tempfile.TemporaryDirectory() as t_dir:
+            if test:
+                temp_dir = r"./test"
+                os.mkdir(temp_dir)
+            else:
+                temp_dir = t_dir
+            # 创建过程目录
+            temp_dir_doc_0 = os.path.join(temp_dir, 'Doc_0')
+            temp_dir_pages = os.path.join(temp_dir, 'Doc_0', "Pages")
+            temp_dir_res = os.path.join(temp_dir, 'Doc_0', "Res")  # 静态资源路径
+            for i in [temp_dir_doc_0, temp_dir_pages, temp_dir_res]:
+                # print(i)
+                os.mkdir(i)
+
+            # 写入 OFD
+            self.ofd.save(os.path.join(temp_dir, 'OFD.xml'))
+
+            # 更新 max_unit_id & 写入 Document
+            self.document.update_max_unit_id()
+            self.document.save(os.path.join(temp_dir_doc_0, 'Document.xml'))
+
+            # 写入 DocumentRes
+            self.document_res.save(os.path.join(temp_dir_doc_0, 'DocumentRes.xml'))
+
+            # 写入 PublicRes
+            self.public_res.save(os.path.join(temp_dir_doc_0, 'PublicRes.xml'))
+
+            # 写入 content_res
+            for idx, page in enumerate(self.content_res):
+                temp_dir_pages_idx = os.path.join(temp_dir_pages, f"Page_{idx}")
+                os.mkdir(temp_dir_pages_idx)
+                # os.mkdir(i)
+                page.save(os.path.join(temp_dir_pages_idx, 'Content.xml'))
+
+            # 写入静态资源
+            for k, v in self.res_static.items():
+                  with open(os.path.join(temp_dir_res, k), "wb") as f:
+                      f.write(v)
+
+            # 打包成ofd
+            zip = zipfile.ZipFile("test.ofd", "w", zipfile.ZIP_DEFLATED)
+            for path, dirnames, filenames in os.walk(temp_dir):
+                # 去掉目标跟路径,只对目标文件夹下边的文件及文件夹进行压缩
+                fpath = path.replace(temp_dir, '')
+
+                for filename in filenames:
+                    zip.write(os.path.join(path, filename), os.path.join(fpath, filename))
+            zip.close()
+            with open("test.ofd", "rb") as f:
+                content = f.read()
+            if os.path.exists("test.ofd"):
+               os.remove("test.ofd")
+            return content
+
+if  __name__ == "__main__":
+    # Demo: build a minimal one-page OFD with two text objects and dump it to test.ofd.
+    print("---------")
+    # resource file (image embedding disabled in this demo)
+    img_path = r"F:\code\easyofd\test\test_img0.jpg"
+    # with open(img_path, "rb") as f:
+    #     content = f.read()
+    content = b""
+    res_static = {"Image_0.jpg": content}
+
+    # build the demo payload
+    font = [
+            {
+
+                "@FontName": "宋体",
+                "@FamilyName": "宋体",
+
+            }
+            ]
+
+    MultiMedia = [
+                {
+
+                    "@Type": "Image",
+                    "ofd:MediaFile": "Image_0.jpg"
+                }
+            ]
+
+    ImageObject = [{
+
+                        "@CTM": "200 0 0 140 0 0",
+                        "@Boundary": "0 0 200 140",
+                        "@ResourceID": "55"
+                    }]
+    TextObject = [
+        {
+
+
+        "@Boundary": "50 5 100 20",
+        "@Font": "2",
+        "@Size": "5",
+        "ofd:FillColor": {
+
+            "@Value": "156 82 35",
+            "@ColorSpace" : "1"
+        },
+
+        "ofd:TextCode": {
+            "@X": "5",
+            "@Y": "5",
+            "@DeltaX": "7 7 7 7 7 7 7 7 7",
+            "#text": "电⼦发票(普通发票)"
+        }
+    }, {
+
+
+        "@Boundary": "0 0 100 100",
+        "@Font": "2",
+        "@Size": "10",
+        "ofd:FillColor": {
+
+            "@Value": "156 82 35"
+        },
+
+        "ofd:TextCode": {
+            "@X": "0",
+            "@Y": "0",
+            "@DeltaX": "0",
+            "#text": "电"
+        }
+    }
+    ]
+
+    # instantiate the templates — all share one CurId so IDs stay document-unique
+    id_obj = CurId()
+    print("id_obj实例化", id_obj)
+
+    ofd = OFDTemplate(id_obj=id_obj)
+    document = DocumentTemplate(id_obj=id_obj)
+    public_res = PublicResTemplate(Font=font, id_obj=id_obj)
+    document_res = DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
+    # ImageObject=ImageObject
+    content_res = ContentTemplate(CGTransform=[], PathObject=[], TextObject=TextObject, ImageObject=[], id_obj=id_obj)
+
+
+
+    ofd_byte = OFDStructure("123",ofd=ofd, document=document,public_res=public_res,
+                            document_res=document_res, content_res=[content_res], res_static=res_static)(test=True)
+
+    with open("test.ofd", "wb") as f:
+        content = f.write(ofd_byte)

+ 966 - 0
format_convert/easyofd/easyofd/draw/pdf_parse.py

@@ -0,0 +1,966 @@
+import os
+import re
+import io
+
+import json
+import time
+import copy
+import string
+import random
+from uuid import uuid1
+from decimal import Decimal
+from collections import OrderedDict
+
+# 第三方包
+import fitz
+from PIL import Image
+# import pdfplumber
+
+__ALL__ = ['pdf_ocr',"DPFParser"]
+
+class MyEncoder(json.JSONEncoder):
+    """JSONEncoder that also serializes bytes (via str) and Decimal (as float)."""
+    def default(self, obj):
+        if isinstance(obj, bytes):
+            return str(obj)  # NOTE(review): yields the "b'...'" repr, not decoded text — confirm intended
+        elif isinstance(obj, Decimal):
+            return float(obj)
+        return json.JSONEncoder.default(self, obj)
+
+class DPFParser(object):
+    """PDF parser (sic: "DPF") built on PyMuPDF (fitz)."""
+    def __init__(self, ):
+        pass
+
+    def extract_text_with_details(self, pdf_bytes):
+        """
+        Extract per-page text spans (position, font, color) and images from a PDF.
+
+        :param pdf_bytes: raw PDF file content as bytes
+        :return: (details_list, res_uuid_map) — details_list is one list of
+                 text/img dicts per page; res_uuid_map registers fonts, image
+                 bytes and page sizes under generated uuids.
+        """
+        details_list = []
+        pdf_stream = io.BytesIO(pdf_bytes)
+
+        # open the BytesIO object directly with fitz.open
+
+        with fitz.open(stream=pdf_stream, filetype="pdf") as doc:
+            res_uuid_map = {
+                "img": {},
+                "font": {},
+                "other": {}
+            } # global resource registry
+            for page_num in range(len(doc)):
+
+
+                page_details_list = []  # entries collected for this page
+                page = doc.load_page(page_num)
+                rect = page.rect
+                width = rect.width
+                height = rect.height
+                if res_uuid_map["other"].get("page_size"):
+                    res_uuid_map["other"]["page_size"][page_num] = [width,height]
+                else :
+                    res_uuid_map["other"]["page_size"] = {page_num: [width, height]}
+                blocks = page.get_text("dict").get("blocks")  # text block info
+                image_list = page.get_images(full=True)  # all images on the page
+                # print(blocks)
+                # collect text spans on this page
+                for block in blocks:
+                    block_text = block.get("text", "")
+                    block_rect = block["bbox"]  # block bounding box [x0, y0, x1, y1]
+
+                    # iterate the lines in the block
+                    for line in block.get("lines", []):
+                        line_text = line.get("spans", [{}])[0].get("text", "")  # first span's text
+                        line_rect = line["bbox"]  # line bounding box
+
+                        # iterate each span of the line to pick up font details
+                        for span in line.get("spans", []):
+                            span_text = span.get("text", "")
+                            font_size = span.get("size")  # font size
+                            font_name = span.get("font")  # font name
+                            res_uuid = None
+                            # register the font once; reuse its uuid afterwards
+                            if font_name not in res_uuid_map["font"].values():
+                                res_uuid = str(uuid1())
+                                res_uuid_map["font"][res_uuid] = font_name
+                            else:
+                                keys = list(res_uuid_map["font"].keys())
+                                vs = list(res_uuid_map["font"].values())
+                                idx = vs.index(font_name)
+                                res_uuid =keys[idx]
+                            font_color = span.get("color")  # font color (may be absent)
+                            span_rect = (
+                            line_rect[0], line_rect[1], line_rect[2], line_rect[3])  # NOTE: uses the whole line's bbox, not the span's own
+
+                            # NOTE(review): debug print on every span — noisy for large PDFs
+                            print(
+                                f"Page: {page_num }, Text: '{span_text}', Font: {font_name}, Size: {font_size}, "
+                                f"Color: {font_color}, Rect: {span_rect} ,res_uuid {res_uuid}")
+
+                            # store the entry (format can be adjusted as needed)
+                            page_details_list.append({
+                                "page": page_num,
+                                "text": span_text,
+                                "font": font_name,
+                                "res_uuid": res_uuid,
+                                "size": font_size,
+                                "color": font_color,
+                                "bbox": list(span_rect),
+                                "type": "text"
+                            })
+
+                for image_index, img_info in enumerate(image_list):
+                    # resolve the image data via its xref
+                    xref = img_info[0]
+                    base_image = doc.extract_image(xref)
+
+                    image_data = base_image["image"]  # raw image bytes
+                    res_uuid = str(uuid1())
+
+                    img_io = io.BytesIO(image_data)
+                    res_uuid_map["img"][res_uuid] = img_io
+                    image_type = base_image["ext"]  # image file extension
+                    smask = base_image["smask"]  # soft-mask xref (original comment said "image type"; unused)
+                    xres = base_image["xres"]  # x resolution (dpi)
+                    yres = base_image["yres"]  # y resolution (dpi)
+                    width = base_image["width"]  # image width
+                    height = base_image["height"]  # image height
+
+
+
+                    # NOTE(review): xres/yres are resolutions, not page
+                    # coordinates — using them as x0/y0 looks wrong; confirm
+                    # against page.get_image_bbox() before relying on these.
+                    x0, y0, x1, y1 = xres, yres,xres+width,yres+height
+                    print(
+                        f"Page: {page_num}, image_type: '{image_type}',x0{x0}, y0{y0}, x1{x1}, y1{y1}  ")
+                    page_details_list.append({
+                        "page": page_num,
+                        "index": image_index,
+                        "x0": x0,
+                        "y0": y0,
+                        "x1": x1,
+                        "y1": y1,
+                        "bbox": [x0,y0,width,height],
+                        "width": width,
+                        "height": height,
+                        "res_uuid": res_uuid,
+                        "image_type": image_type,
+                        "type": "img"
+                    })
+
+                details_list.append(page_details_list)
+        # print("details_list",details_list)
+        return details_list, res_uuid_map
+    def to_img(self, buffer_pdf):
+        """Render every page of the PDF (bytes) to a fitz Pixmap at ~1.33x zoom."""
+        pix_list = []
+        pdfDoc = fitz.open(stream=buffer_pdf)
+        for pg in range(pdfDoc.page_count):
+            page = pdfDoc[pg]
+            rotate = int(0)
+            # Zoom factor per axis; raises the rendered resolution accordingly.
+            # Without it the default render is 792x612 at dpi=96.
+            zoom_x = 1.33333333 #(1.33333333-->1056x816)   (2-->1584x1224)
+            zoom_y = 1.33333333
+            # zoom_x,zoom_y = (1,1)
+            mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
+            pix = page.get_pixmap(matrix=mat, alpha=False)
+
+
+            pix_list.append(pix)
+        return pix_list
+           
+            
+            
+    def get_size(self):
+        """Placeholder — not implemented."""
+        pass
+    
+def coast_time(func):
+    '''
+    计算对象执行耗时
+    '''
+    def fun(*agrs, **kwargs):
+        t = time.perf_counter()
+        result = func(*agrs, **kwargs)
+        print(f'function {func.__name__} coast time: {time.perf_counter() - t:.8f} s')
+        return result
+    return fun
+
+
+class BaseInit:
+    '''
+    Basic information and working directories needed to parse a PDF.
+    '''
+
+    def __init__(self, pdf_path, output_path):
+
+        self.file_path = pdf_path
+        self.output_path = output_path
+        # file_name
+        self.file_name = os.path.basename(self.file_path)
+        # file_type (extension, including the dot)
+        self.fileType = os.path.splitext(self.file_path)[-1]
+        # file name without suffix
+        self.file_no_suffix = self.file_name[:-len(self.fileType)]
+        # NOTE(review): mixes str letters with int digits (0-9 as ints); works
+        # only because genShortId() wraps the chosen element in str().
+        self.uuidChars = tuple(list(string.ascii_letters) + list(range(10)))
+        # table placeholder / separator characters
+        self.divide = ':'
+        self.solid = ''
+        # create the intermediate directories used by the whole pipeline
+        # IoU ratio threshold
+        self.iou_rate = 0.001
+        self.init_file()
+
+    def init_file(self):
+        """
+        Create the folders the pipeline needs (image dump dir, json result dir).
+        """
+        self.image_folder_path = os.path.join(self.output_path, 'pdf_img_save')
+        self.json_folder_path = os.path.join(self.output_path, 'json')
+        self.ocr_result_path = os.path.join(self.json_folder_path, self.file_no_suffix + '.json')
+        # more dirs (txt, ...) may follow; the current flow needs these
+        for path in [self.image_folder_path, self.json_folder_path]:
+            if not os.path.exists(path):
+                os.makedirs(path)
+
+    def genShortId(self, length=12):
+        """
+        Generate a short pseudo-random id derived from uuid1.
+
+        :params length: total length of the generated id (default 12)
+        """
+        uuid = str(uuid1()).replace('-', '')
+        result = ''
+        for i in range(0, 8):
+            sub = uuid[i * 4: i * 4 + 4]
+            x = int(sub, 16)
+            result += str(self.uuidChars[x % 0x3E])
+        return result + ''.join(random.sample(uuid, length - 8))
+
+
+class PageInfo(BaseInit):
+    '''
+    Per-page registry of images and tables.
+
+    NOTE(review): both registries are class-level mutable dicts, i.e. shared
+    across ALL instances and subclasses — confirm this is intended.
+    '''
+    __page_image = {}
+    __page_table = {}
+
+    @classmethod
+    def add_image(cls, page_num, image):
+        # append *image* to the page's list, creating the list on first use
+        if not cls.__page_image.get(page_num):
+            cls.__page_image[page_num] = []
+        cls.__page_image[page_num].append(image)
+
+    @classmethod
+    def add_table(cls, page_num, table):
+        # append *table* to the page's list, creating the list on first use
+        if not cls.__page_table.get(page_num):
+            cls.__page_table[page_num] = []
+        cls.__page_table[page_num].append(table)
+
+    @classmethod
+    def get_image(cls, page_num):
+        # images registered for *page_num* (empty list if none)
+        return cls.__page_image.get(page_num, [])
+
+    @classmethod
+    def get_table(cls, page_num):
+        # tables registered for *page_num* (empty list if none)
+        return cls.__page_table.get(page_num, [])
+
+    @classmethod
+    def save_image(cls, output_path, file):
+        '''
+        Write every registered image to <output_path>/page_img_save as JPG.
+        :param output_path: destination root directory
+        :param file: source file name; its stem prefixes each image file name
+        :return: None
+        '''
+        file = file.split('.')[0]
+        for images in cls.__page_image.values():
+            for image in images:
+                iamge_content = image['objContent']  # (sic: local-name typo, harmless)
+                name = image['name']
+                img_dir = os.path.join(output_path, 'page_img_save')
+                img_path = os.path.join(img_dir, file + '_' + name + '.jpg')
+                if not os.path.exists(img_dir):
+                    os.mkdir(img_dir)
+                with open(img_path, 'wb') as fp:
+                    fp.write(iamge_content)
+
+
class ParseFile(PageInfo):
    # NOTE(review): parse_pdf() reads self.pdfplum_doc_pages, but load_pdf()
    # leaves the pdfplumber open commented out -- confirm it is restored or set
    # elsewhere, otherwise extract_table() fails with AttributeError.

    def __init__(self, pdf_path, output_path, table_type='v2', is_save=True):
        """Per-file PDF parser.

        :param pdf_path: path of the PDF to parse
        :param output_path: directory for image/json output
        :param table_type: result layout; 'v2' folds tables into the line list
        :param is_save: whether to persist the final result to disk
        """
        super().__init__(pdf_path, output_path)
        print('初始化 pdf 对象:{}'.format(self.file_path))
        self.is_save = is_save
        self.table_type = table_type
        # v1 result list: text lines and tables kept separate
        self.page_result_list = []
        # v2 result list: text lines and tables merged
        self.combine_page_result_list = []

    @coast_time
    def get_result(self):
        """Parse the whole PDF and return the per-page result list."""
        self.load_pdf()
        result = self.parse_pdf()
        self.ocr_result = result
        print(f'解析完成:共 {len(result)} 页  表格类型: {self.table_type}')
        return result

    def load_pdf(self):
        # Open the document with fitz (PyMuPDF).
        self.fitz_doc = fitz.open(self.file_path, filetype='pdf')
        # self.pdfplum_doc_pages = pdfplumber.open(self.file_path).pages
        # assert len(self.fitz_doc) == len(self.pdfplum_doc_pages)

    def parse_pdf(self):
        """Walk every page: group lines, extract tables/images, build results."""
        for page_no, fitz_doc in enumerate(self.fitz_doc):
            # debug aid: uncomment to parse a single page
            # if page_no != 25:
            #     continue
            self.height = fitz_doc.get_text('dict')['height']
            self.width = fitz_doc.get_text('dict')['width']
            # Aggregate the char / line / block info parsed by fitz
            line_list = self.group_block(page_no, fitz_doc)
            # Extract the tables on this page
            table_list = self.extract_table(page_no, self.pdfplum_doc_pages[page_no])
            # Compute row/column merge (span) info for each table
            table_list = list(CalcTableRL(table_list).run())
            # Fetch the images collected for this page
            image_list = self.get_image(page_no)
            # Build the final per-page result structure
            page_result = self.construct_final_result(line_list, page_no, image_list, table_list)

            if self.table_type == 'v2':
                # Merge into the format OCR expects: tables folded into the line list
                combine_page_result_list = self.combine_table_v2(page_result)
                page_result = self.construct_final_result(combine_page_result_list, page_no, image_list, table_list)

            self.page_result_list.append(page_result)
            if page_no and  page_no % 10 == 0:
                print(f'解析前 {page_no} 页完成')
        final_result_list = copy.deepcopy(self.page_result_list)
        # Convert to the format the OCR pipeline expects
        if self.table_type == 'v2':
            final_result_list = self.reform_ocr_result(final_result_list)
        # 2023/09/26: add contIndex before saving, for the downstream extraction model
        for page_num, page in enumerate(final_result_list):
            # NOTE(review): `break` stops contIndex for all remaining pages when
            # one page has no lines -- confirm `continue` was not intended.
            if not page.get('lineList'):
                break
            contIndex = {}
            for line in page['lineList']:
                line_bak = dict(copy.copy(line))
                line_bak["objType_postpreprocess"] = f"{line_bak.get('objType','textLine')}_postpreprocess"
                contIndex[line_bak["lineId"]] = line_bak

            page["contIndex"] = contIndex
            for line in page['lineList']:
                print(page_num, line['objType'], line['objContent'])
        # Persist to local disk
        if self.is_save:
            self.save_result(final_result_list)
        for page_num, page in enumerate(final_result_list):
            for line in page['lineList']:
                print(page_num, line['objType'], line['objContent'])
        return final_result_list

    def combine_table_v2(self, page_result):
        """Fold the page's tables into its line list (v2 layout)."""
        lineList = page_result['lineList']
        table_list = page_result['table_list']
        # Split into table / non-table lines first to cut later time complexity
        __notable_lines, __all_table_lines = self.filter_table_line(lineList, table_list)
        notable_lines, all_table_lines = copy.deepcopy(__notable_lines), copy.deepcopy(__all_table_lines)
        del __notable_lines, __all_table_lines, lineList
        # Merge
        combine_page_result_list = self.combine_table_with_line(notable_lines, all_table_lines, table_list)
        return combine_page_result_list

    def filter_table_line(self, lineList, table_list):
        '''
        Pick out the lines belonging to each table, and leave a 'table'
        placeholder in __notable_lines at every table position so the table
        rows can be spliced back in later.
        __notable_lines: lines that do not belong to any table
        __all_table_lines: lines that belong to tables, grouped per table
        '''
        __notable_lines = []
        __all_table_lines = []
        for table_info in table_list:
            table_bbox = table_info['objPos']
            # all lines belonging to the current table
            __sub_table_lines = []
            is_iter_table = False
            while lineList:
                line = lineList.pop(0)
                line_bbox = line['objPos']
                # Guard against empty-table misdetection: once the line's Y
                # passes the table bottom, nothing further can match
                table_y, line_y = table_bbox[3], line_bbox[1]
                if line_y >= table_y:
                    lineList.insert(0, line)
                    break
                iou = self.count_iou(table_bbox, line_bbox)
                # the line overlaps the table region
                if iou > 0:
                    __sub_table_lines.append(line)
                    # first line matched for this table
                    if not is_iter_table:
                        is_iter_table = True
                        # insert the placeholder marker
                        __notable_lines.append('table')
                elif iou <= 0 and not is_iter_table:
                    __notable_lines.append(line)
                # finished with the current table
                elif iou <= 0 and is_iter_table:
                    lineList.insert(0, line)
                    line_index, flag = self.more_judge(table_bbox, lineList)
                    if flag:
                        # skip ahead to line_index and keep scanning
                        # more_lines = copy.deepcopy()
                        __notable_lines.extend(lineList[:line_index])
                        lineList = lineList[line_index:]
                    else:
                        break
            __all_table_lines.append(__sub_table_lines)
        # All tables processed; append the remaining lines
        if lineList:
            __notable_lines.extend(lineList)
        return __notable_lines, __all_table_lines

    def more_judge(self, table_bbox, lineList, max_judge=6):
        '''
        Check whether any of the next few lines still belong to the current
        table. Extra check for fuzzy table/line boundaries, e.g. multi-column
        pages or incomplete tables.
        :return: (index of the matching line, True | False)
        '''
        # look ahead at most max_judge lines
        if len(lineList) < max_judge:
            judge_lines = lineList
        else:
            judge_lines = lineList[:max_judge]
        # NOTE(review): if lineList is empty, `index` below is unbound
        # (NameError); the caller re-inserts a line before calling -- confirm.
        for index, line in enumerate(judge_lines):
            line_bbox = line['objPos']
            iou = self.count_iou(table_bbox, line_bbox)
            if iou > 0:
                return index, True
        return index, False


    def combine_table_with_line(self, notable_lines, all_table_lines, table_list):
        '''
        Merge text lines / chars into the matching table rows and cells.
        '''
        for table_id, table in enumerate(table_list):
            new_table_lines = []
            for table_line in table['lineList']:
                is_iter_table = False
                table_line_bbox = table_line['objPos']
                # scan every candidate line of this table: global match
                for __line in all_table_lines[table_id]:
                    line = copy.deepcopy(__line)
                    line_bbox = line['objPos']
                    iou = self.count_iou(table_line_bbox, line_bbox)
                    # First match: replace the text line's content/bbox with the
                    # table row's; the line's other fields stay untouched
                    if iou > self.iou_rate and not is_iter_table:
                        is_iter_table = True
                        line['objContent'] = table_line['objContent']
                        line['objPos'] = table_line['objPos']
                        line['objType'] = 'table'
                        line['tableId'] = table_id
                        self.combine_cell_with_span(table_line, line)
                        line['cells'] = table_line['cells']
                        new_table_lines.append(line)
                    elif iou > self.iou_rate and is_iter_table:
                        self.combine_cell_with_span(table_line, line)
                    else:
                        pass
            if 'table' not in notable_lines or not new_table_lines:
                # FIX ERROR: 'table' is not in list
                # handles a small table detected inside a big one
                # possible bug: multiple nested big tables would desync the
                # line allocation vs. the number of placeholders
                continue
            # splice new_table_lines in place of the 'table' placeholder
            table_index = notable_lines.index('table')
            new_notable_lines = notable_lines[:table_index]
            new_notable_lines.extend(new_table_lines)
            notable_lines = new_notable_lines + notable_lines[table_index+1:]
        return notable_lines

    def combine_cell_with_span(self,table_line , text_line):
        '''
        Attach the matching spans' chars to each table cell: fixes char
        ordering when a merged cell spans multiple text lines.
        '''
        del_list = []
        for index, cell in enumerate(table_line['cells']):
            if not cell.get('chars'):
                cell['chars'] = []
            cell_bbox = cell['objPos']
            if cell_bbox is None:
                del_list.append(index)
                continue
            for span in  text_line['span']:
                span_bbox = span['bbox']
                iou = self.count_iou(cell_bbox, span_bbox)
                if iou < self.iou_rate:
                    continue
                # Drill down to individual chars to work around span/cell
                # length mismatches
                for char in span['chars']:
                    char_bbox = char['bbox']
                    iou = self.count_iou(cell_bbox, char_bbox)
                    if iou > self.iou_rate:
                        cell['chars'].append(char)
                    else:
                        pass
        # Remove cells without a bbox (shift indices as earlier ones are deleted)
        if len(del_list):
            for index, index_del in enumerate(del_list):
                index_del -= index
                del table_line['cells'][index_del]

    def group_block(self, page_num, fitz_doc):
        """
        Combine the block info from both extraction modes so that every span
        carries the info of each of its characters.
        Official docs: https://pymupdf.readthedocs.io/en/latest/textpage.html#textpagedict
        :param fitz_doc:
        :return: total_info
        """
        line_count = 0
        total_line_list = []
        # char_blocks: finest granularity is a single character
        char_blocks = fitz_doc.get_text('rawdict')['blocks']
        # block_blocks: finest granularity is a span within a line
        block_blocks = fitz_doc.get_text('dict')['blocks']
        # sort the text blocks first (top-to-bottom, then left-to-right)
        char_blocks.sort(key=lambda x: [int(x['bbox'][1]), int(x['bbox'][0])])
        block_blocks.sort(key=lambda x: [int(x['bbox'][1]), int(x['bbox'][0])])
        # pair the two views up
        group_blocks = zip(block_blocks, char_blocks)
        for span_blocks, char_block in group_blocks:
            if span_blocks['type'] == 1:
                # image block: stash the embedded image
                img_attrs = self.deal_image(page_num, line_count, span_blocks)
                self.add_image(page_num, img_attrs)
                continue
            for line_index, line in enumerate(span_blocks['lines']):
                line['text'] = ''
                line['chars'] = []
                line['span'] = []
                # merge each line here in a single pass to keep complexity down,
                # attaching the info of every character in the line
                for span_index, span in enumerate(line['spans']):
                    span['text'] = span['text'].replace(' ', '').strip()
                    if not span['text']:
                        continue
                    # copy the chars from char_block onto the matching span
                    span_chars = char_block['lines'][line_index]['spans'][span_index]['chars']
                    span_chars = [char for char in span_chars if char['c'].strip()]
                    line['text'] += span['text']
                    line['chars'].extend(span_chars)
                    line['span'].append({'bbox': span['bbox'], 'chars': span_chars,'text': span['text']})
                if not line['text']:
                    continue
                # build the per-line data structure
                line_info = self.construct_line_info(line['text'], line['bbox'], line['span'], line['chars'],
                                                     line_count, page_num)
                total_line_list.append(line_info)
                line_count += 1
        return total_line_list

    def extract_table(self, page_no, plum_page):
        '''
        Extract every table on the page.
        :param page_no:
        :param plum_page: pdfplumber page object
        :return: list of table info dicts
        '''
        table_list = []
        for table in plum_page.find_tables():
            # merge the table's cells into per-row entries
            table_line_list = self.merge_table_row(table)
            if not table_line_list:
                continue
            table_info = self.deal_table(page_no, table.bbox, table_line_list)
            table_list.append(table_info)
            # also record the table in the class-level store | somewhat redundant
            self.add_table(page_no, table_info)
        return table_list

    def merge_table_row(self, table):
        '''
        Merge the table cells row by row.
        :param table:
        :return: [({line_text}, {line_bbox}), ...]
        '''
        table_line_list = []
        for item, row in zip(table.extract(), table.rows):
            # preprocess each table row
            table_line = self.divide.join([self.clear_text(txt) for txt in item])
            # skip rows that are empty after cleaning
            __line = self.clear_text(table_line).replace(' ', '')
            if not __line:
                continue
            table_line_list.append((table_line, row.bbox, zip(item, row.cells)))
        return table_line_list

    def clear_text(self, txt, retrans=False):
        """Normalize cell text; with retrans=True strip the placeholder and
        divider markers instead of inserting the placeholder."""
        if retrans:
            txt = txt.replace(self.solid, '').replace(self.divide, '')
        else:
            # replace empty cells with the placeholder
            txt = txt if txt else self.solid
        return str(txt).replace('\n', '').replace(' ', '')

    def deal_table(self, page_no, table_bbox, table_line_list):
        '''
        Convert a table into the output structure.
        :param page_no:
        :param table_bbox:
        :param table_line_list:
        :return: table info dict
        '''
        table_first_line = self.clear_text(table_line_list[0][0], retrans=True)
        table_id = '{0}_{1}_'.format(page_no, table_first_line) + self.genShortId()
        lineList = [{
            'objContent': line[0],
            'objPos': line[1],
            'cells': self.deal_table_cell(line[2])
        } for line in table_line_list]
        table_info = {
            'tableId': table_id,
            'name': table_id,
            'objPos': table_bbox,
            'lineList': lineList,
        }
        return table_info

    def deal_table_cell(self, cells):
        # normalize each (text, bbox) pair into the cell structure
        return [{"objContent": self.clear_text(text), "objPos": box} for text, box in cells]

    def deal_image(self, page_num, name, img_attrs):
        '''
        Convert an image block into the output structure.
        :param page_num:
        :param name:
        :param img_attrs:
        :return: image info dict
        '''
        image_id = '{0}_{1}_'.format(page_num, name) + self.genShortId()
        img_info = {
            'imageId': image_id,
            'name': image_id,  # temporarily named after the line count on its page
            'objPos': img_attrs['bbox'],
            'ext': img_attrs['ext'],
            'objContent': img_attrs['image'],
            'size': img_attrs['size']
        }
        return img_info

    def deal_chars(self, line_num, lineId, chars):
        '''
        Convert chars into the output structure.
        :param line_num:
        :param lineId:
        :param chars:
        :return: list of char dicts (whitespace chars dropped)
        '''
        num_count = 0
        char_list = []
        for char in chars:
            if not char['c'].strip():
                continue
            char_dict = {
                'lineId': lineId,
                'charId': 'char_' + str(line_num) + '_' + str(num_count) + '_' + self.genShortId(),
                'objContent': char['c'],
                'objPos': char['bbox']
            }
            char_list.append(char_dict)
            num_count += 1
        return char_list

    def construct_line_info(self, text, rect, span, chars, count, pageNo, objType='textLine'):
        '''
        Convert one line into the output structure.
        # x, y, h, w = rect[0], rect[1], rect[3] - rect[1], rect[2] - rect[0]
        '''
        lineId = 'line_' + str(pageNo) + '_' + str(count) + '_' + self.genShortId()
        chars = self.deal_chars(count, lineId, chars)
        return OrderedDict({
            'lineNo': count,
            'lineId': lineId,
            'objType': objType,
            'objContent': re.sub(r'\s', '', text),
            'chars': chars,
            'objPos': rect,
            'span': span
        })

    @staticmethod
    def rect_format(bbox):
        '''
        Coordinate conversion: x1, y1, x2, y2 >> y1, x1, h, w
        :param rect: [x1, y1, x2, y2]
        :return: [y, x, h, w]
        '''
        y, x, h, w = bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]
        return [y, x, h, w]

    def count_iou(self, RecA, RecB):
        '''
        Compute the bbox intersection-over-union.
        Top-left corners are Ax0, Ay0, Bx0, By0;
        bottom-right corners are Ax1, Ay1, Bx1, By1.
        Intersection:
            M = min(Ax1, Bx1) - max(Ax0, Bx0)
            H = min(Ay1, By1) - max(Ay0, By0)
        # boundary info of the current table
        left_x, top_y, right_x, botm_y: table_box_info[0], table_box_info[1], table_box_info[2], table_box_info[3]
        '''
        M = min(RecB[2], RecA[2]) - max(RecB[0], RecA[0])
        H = min(RecB[3], RecA[3]) - max(RecB[1], RecA[1])

        # intersection area
        interArea = max(0, M) * max(0, H)

        # area of each box
        RecA_Area = (RecA[2] - RecA[0]) * (RecA[3] - RecA[1])
        RecB_Area = (RecB[2] - RecB[0]) * (RecB[3] - RecB[1])
        # IOU
        iou = interArea / float(RecA_Area + RecB_Area - interArea)
        return iou

    def construct_final_result(self, line_list, pageNo, image_list=[], table_list=[]):
        '''
        Convert one page into the final data structure.
        NOTE(review): mutable default arguments -- harmless here because they
        are never mutated, but worth confirming.
        :param line_list: per-line OCR result
        :param pageNo: page number
        :param image_list:
        :param table_list:
        :return: type: Dict
        '''
        document_id = 'v1' + '_' + self.file_no_suffix + '_' + self.genShortId()
        return OrderedDict({
            'pageNo': pageNo,
            'docID': document_id,
            'page_info':{'size': [self.width, self.height]},
            'lineList': line_list,
            'image_list': image_list if image_list else [],
            'table_list': table_list if table_list else []
        })

    def save_result(self, final_result_list):
        '''
        Persist the result data to disk as JSON.
        '''
        if self.table_type == 'v2':
            with open(self.ocr_result_path, 'w', encoding='utf-8') as f:
                json.dump(final_result_list, f, indent=4, ensure_ascii=False)
        else:
            with open(self.ocr_result_path, 'w', encoding='utf-8') as f:
                json.dump(self.page_result_list, f, cls=MyEncoder, indent=4, ensure_ascii=False)

    def reform_ocr_result(self, final_result_list):
        """
        Final post-processing of the returned result: re-number the lines and
        convert all coordinates to the OCR format.
        :param final_result_list: merged local-parse / OCR results
        """
        for result_list in final_result_list:
            del result_list['image_list']
            del result_list['table_list']
            lineList = result_list['lineList']
            for num, line in enumerate(lineList):
                # rewrite line number and line id
                line['lineNo'] = str(num)
                line_split = line['lineId'].split('_')
                line_split[-2] = str(num)
                line['lineId'] = '_'.join(line_split)
                # convert the coordinate format
                obj_type = line['objType']
                # compute each char's x/y offset relative to the line origin
                offset_x_list, offset_y_list = self.coord_offset(line, obj_type)
                line['objPos'] = self.rect_format(line['objPos'])
                line['objPos'].append(offset_x_list)
                line['chars_offset'] = [offset_x_list, offset_y_list]
                if line.get('chars'):
                    del line['chars']
                if obj_type == 'table' and line.get('span'):
                    del line['span']
        return final_result_list

    def coord_offset(self, line, obj_type='textLine'):
        '''
        Compute, for each character, the offset of its top-left corner from
        the line's top-left corner.
        @obj_type: textLine | table
        '''
        offset_x_list = []
        offset_y_list = []
        line_x, line_y = line['objPos'][0], line['objPos'][1]
        if obj_type == 'textLine':
            for span in line['span']:
                self.all_rect_format(span)
                for char in span['chars']:
                    char_x, char_y = char['bbox'][0], char['bbox'][1]
                    offset_x_list.append(char_x - line_x)
                    offset_y_list.append(char_y - line_y)
                    self.all_rect_format(char)
        else:
            __cells = []
            for num, _cell in enumerate(line['cells']):
                cell = copy.deepcopy(_cell)
                self.all_rect_format(cell)
                for char in cell['chars']:
                    char_x, char_y = char['bbox'][0], char['bbox'][1]
                    offset_x_list.append(char_x - line_x)
                    offset_y_list.append(char_y - line_y)
                    self.all_rect_format(char)
                __cells.append(cell)
            line['cells'] = __cells
        return offset_x_list, offset_y_list

    def all_rect_format(self, obj):
        '''
        Convert any span/cell/char object's fields to the OCR format.
        '''
        if 'chars' in obj:
            if obj.get('text'):
                obj['objContent'] = obj['text']
                del obj['text']
            if obj.get('objPos'):
                obj['objPos'] = self.rect_format(obj['objPos'])
            elif obj.get('bbox'):
                obj['objPos'] = self.rect_format(obj['bbox'])
                del obj['bbox']
        else:
            obj['objContent'] = obj['c']
            obj['objPos'] = self.rect_format(obj['bbox'])
            del obj['c']
            del obj['bbox']
+
class CalcTableRL:
    '''
    Reconstructs the implicit (dashed) table grid and computes row/column
    merge information.
    Input: table structure info that must contain every cell's coordinates.
    Adds row_start_end / col_start_end attributes to each cell.
    '''
    def __init__(self, table_info):
        self.table_info = table_info

    def run(self):
        if isinstance(self.table_info, list):
            tables = self.table_info
        else:
            tables = [self.table_info]
        for one_table in tables:
            yield self.add_table_property(one_table)

    def add_table_property(self, table_info):
        '''
        Annotate the table structure with merge info:
        cell['col_start_end'] = (col_start, col_end)
        cell['row_start_end'] = (row_start, row_end)
        '''
        # gather every distinct x / y coordinate
        xs, ys = self.collect_table_coord(table_info)
        # the sorted coordinate sets define the finest-grained virtual grid
        grid_x, grid_y = sorted(xs), sorted(ys)
        positioned_cells = (
            cell
            for line in table_info['lineList']
            for cell in line['cells']
            if cell['objPos'] is not None
        )
        for cell in positioned_cells:
            x1, y1, x2, y2 = cell['objPos']
            # locate the cell's corners on the virtual grid
            cell['col_start_end'] = (grid_x.index(x1), grid_x.index(x2))
            cell['row_start_end'] = (grid_y.index(y1), grid_y.index(y2))
        return table_info

    def collect_table_coord(self, table_info):
        '''
        Collect all x and y coordinate points.
        Given one table's info, extract the deduplicated x1, y1, x2, y2
        coordinates of every cell.
        :param table_info:
        :return: set(x), set(y)
        '''
        xs, ys = set(), set()
        for line in table_info['lineList']:
            for cell in line['cells']:
                if cell['objPos'] is None:
                    continue
                x1, y1, x2, y2 = cell['objPos']
                xs.update((x1, x2))
                ys.update((y1, y2))
        return xs, ys
+
+
+
def pdf_ocr(pdf_path, output_path, table_type='v2', is_save=True):
    '''
    Thin wrapper around ParseFile, convenient for direct calls and pools.
    '''
    parser = ParseFile(pdf_path, output_path, table_type, is_save)
    parser.get_result()
    return parser
+
# --------------------------- Test cases below -----------------------------------
+
@coast_time
def test_dir():
    """Parse every matching PDF under a fixed input directory."""
    input_dir = r'E:\workplace\cjhx_test\创金和信\pdf2json\input\all_test'
    output_dir = r'E:\workplace\cjhx_test\创金和信\pdf2json\file_data\all_test'
    for cur_dir, _sub_dirs, files in os.walk(input_dir):
        for file in files:
            if 'test.pdf' not in file:
                continue
            pdf_ocr(os.path.join(cur_dir, file), output_dir)
+
@coast_time
def test_single():
    """Parse a single hard-coded PDF with the v2 table layout."""
    file_path = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/20220913-浙江省贰号职业年金计划银华资产组合2022年二季度管理费用支付指令.pdf'
    output_dir = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/电子解析'
    pdf_ocr(file_path, output_dir, table_type='v2')
+
@coast_time
def test_thread():
    """Fan the PDFs out over a process pool (parsing is CPU-bound)."""
    from concurrent.futures import ProcessPoolExecutor
    pool = ProcessPoolExecutor(max_workers=8)
    input_dir = r'E:\workplace\daily_work\pdf2json\input\签字模板二'
    output_dir = r'E:\workplace\daily_work\pdf2json\output\签字模板二'
    for cur_dir, _sub_dirs, files in os.walk(input_dir):
        for file in files:
            future = pool.submit(pdf_ocr, os.path.join(cur_dir, file), output_dir, table_type='v2')
            future.add_done_callback(print_callback)
    pool.shutdown()
+
def print_callback(ret):
    """Done-callback for pool futures; result printing is currently disabled."""
    # print('ret:', ret.result())
    pass
+
if __name__ == '__main__':
    # test_dir()
    # test_thread()
    # test_single()
    pdf_obj = DPFParser()
    with open(r"F:\code\easyofd\test\test.pdf", "rb") as f:
        pdf_bytes = f.read()

    img_list = pdf_obj.to_img(pdf_bytes)
    pil_img_list = []
    for page_no, _img in enumerate(img_list):
        print(_img.width, _img.height)
        img = Image.frombytes("RGB", [_img.width, _img.height], _img.samples)
        print(type(img))
        # FIX: the list was created but never filled, and every page was saved
        # over the same 'output_image.png' (only the last page survived) --
        # collect each page and save it under its own numbered name.
        pil_img_list.append(img)
        img.save('output_image_{}.png'.format(page_no))
+    

BIN
format_convert/easyofd/easyofd/draw/simsun.ttc


+ 301 - 0
format_convert/easyofd/easyofd/ofd.py

@@ -0,0 +1,301 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: F:\code\easyofd\easyofd
+# CREATE_TIME: 2023-10-07
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# note:  ofd 基础类
+import base64
+import os
+import sys
+from io import BytesIO
+from typing import Union
+
+# sys.path.insert(0, os.getcwd())
+# sys.path.insert(0, "..")
+
+import fitz
+from PIL import Image
+from fontTools.ttLib import TTFont
+from loguru import logger
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../")
+
+from format_convert.easyofd.easyofd.parser_ofd import OFDParser
+from format_convert.easyofd.easyofd.draw import DrawPDF, OFDWrite
+
+
class OFD(object):
    """In-memory OFD document with parse / render / convert helpers.

    ``self.data`` holds the intermediate structure produced by ``OFDParser``
    and consumed by ``DrawPDF`` / ``OFDWrite``.
    """

    def __init__(self):
        self.data = None

    def read(self, ofd_f: Union[str, bytes, BytesIO], fmt="b64", save_xml=False, xml_name="testxml", save_dir=None):
        """Load an OFD payload and parse it into ``self.data``.

        :param ofd_f: the OFD payload; its concrete type depends on ``fmt``
        :param fmt: one of "path" (file path), "b64" (base64 string),
                    "binary" (raw bytes) or "io" (BytesIO)
        :param save_xml: forwarded to the parser - dump the intermediate XML
        :param xml_name: file name used when saving the XML
        :param save_dir: directory used when saving the XML
        :raises ValueError: if ``fmt`` is not one of the supported values
        """
        if fmt == "path":
            with open(ofd_f, "rb") as f:
                ofd_f = str(base64.b64encode(f.read()), encoding="utf-8")
        elif fmt == "b64":
            pass
        elif fmt == "binary":
            ofd_f = str(base64.b64encode(ofd_f), encoding="utf-8")
        elif fmt == "io":
            ofd_f = str(base64.b64encode(ofd_f.getvalue()), encoding="utf-8")
        else:
            # FIX: the original `raise "fomat Error: %s" % fmt` raised a plain
            # str, which is itself a TypeError in Python 3 ("exceptions must
            # derive from BaseException") -- raise a real exception instead.
            raise ValueError("fomat Error: %s" % fmt)

        self.data = OFDParser(ofd_f)(save_xml=save_xml, xml_name=xml_name, save_dir=save_dir)

    def save(self):
        """
        draw ofd xml: initialize an XML file from ``self.data``.
        NOTE(review): not implemented yet -- currently only checks that
        parsed data exists.
        """
        assert self.data, "data is None"

    def pdf2ofd(self, pdfbyte, optional_text=False):
        """Convert PDF bytes into OFD bytes via ``OFDWrite``."""
        assert pdfbyte, "pdfbyte is None"
        ofd_byte = OFDWrite()(pdfbyte, optional_text=optional_text)
        return ofd_byte

    def to_pdf(self, return_need_convert_as_image=False):
        """Render ``self.data`` to PDF bytes.

        :param return_need_convert_as_image: if True, also return the dict of
            pages that must be rasterized to images
        :return: pdf bytes, or (pdf bytes, page_need_to_image_dict)
        """
        assert self.data, "data is None"
        obj = DrawPDF(self.data)
        result = obj()
        if not return_need_convert_as_image:
            return result
        return result, obj.page_need_to_image_dict

    def pdf2img(self, pdfbytes):
        """Rasterize every page of a PDF; return a list of PIL images."""
        image_list = []
        doc = fitz.open(stream=pdfbytes, filetype="pdf")
        for page in doc:
            rotate = int(0)
            # 2x zoom gives a reasonable rendering resolution
            # (a dead 1.6 assignment that was immediately overwritten removed)
            zoom_x, zoom_y = 2, 2
            mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            pil_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image_list.append(pil_image)
        return image_list

    def jpg2ofd(self, imglist: list):
        """Build an OFD document from a list of PIL images.

        imglist: pil image list
        """
        ofd_byte = OFDWrite()(pil_img_list=imglist)
        return ofd_byte

    def jpg2pfd(self, imglist: list):
        """Build a PDF from a list of PIL images.

        1. build the intermediate data
        2. render it with DrawPDF
        NOTE(review): the name looks like a typo for ``jpg2pdf`` -- kept
        unchanged for backward compatibility.
        """
        data = OFDParser(None).img2data(imglist)
        return DrawPDF(data)()

    def to_jpg(self, format="jpg"):
        """Render ``self.data`` to a list of PIL images (via an in-memory PDF).

        NOTE(review): the ``format`` parameter is currently unused.
        """
        assert self.data, "data is None"
        pdfbytes = self.to_pdf()
        return self.pdf2img(pdfbytes)

    def del_data(self):
        """Drop the parsed data."""
        self.data = None

    def __del__(self):
        # `del self` only unbinds the local name -- effectively a no-op;
        # kept for backward compatibility.
        del self

    def disposal(self):
        """Dispose of the object."""
        self.__del__()
+
+
def find_similar_characters():
    """Return (canonical, compatibility) pairs of duplicate CJK characters.

    CJK Compatibility Ideographs (U+F900..U+FAFF) are duplicate code points
    that canonically decompose to a unified ideograph.

    FIX: the original scanned U+4E00..U+9FFF (unified ideographs), whose
    Unicode names never start with 'CJK COMPATIBILITY IDEOGRAPH', and its
    ``unicodedata.lookup(name.split()[-1])`` always raised -- so it always
    returned an empty list. Also ``unicodedata`` was never imported. Scan the
    compatibility block and use NFC normalization (which maps a singleton
    decomposition back to the canonical character) instead.
    """
    import unicodedata
    similar_pairs = []
    for code in range(0xF900, 0xFB00):  # CJK Compatibility Ideographs block
        char = chr(code)
        canonical = unicodedata.normalize('NFC', char)
        # unassigned code points and the few genuine unified ideographs in
        # this block normalize to themselves and are skipped
        if canonical != char:
            similar_pairs.append((canonical, char))
    return similar_pairs
+
+
def save_chinese_characters(output_path):
    """Write the CJK Compatibility Ideographs (U+F900..U+FAD9) to a file,
    one character per line, UTF-8 encoded."""
    compat_chars = (chr(code) for code in range(0xF900, 0xFAD9 + 1))
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(char + '\n' for char in compat_chars)
+
+
def map_kangxi_to_common_characters(kangxi_start=0x2F00, kangxi_end=0x2FDF, common_start=0x4E00, common_end=0x9FFF, output_path="kangxi_to_common.txt"):
    """Map Kangxi radicals (U+2F00..U+2FDF) to their common CJK equivalents.

    Bug fixed: the previous nested loop compared
    ``chr(kangxi_code) == chr(common_code)`` for *different* code points,
    which is never true -- an O(n^2) scan that always produced an empty
    file. Kangxi radicals carry a compatibility decomposition to their
    unified counterpart, so NFKC normalization yields the mapping directly
    in a single pass.

    Writes one line per radical whose NFKC form is a single character in
    [common_start, common_end], in the same format as before:
    "<kangxi> (Kangxi: 0x..) -> <common> (Common: 0x..)".
    """
    import unicodedata  # local import: module-level imports are outside this block
    with open(output_path, 'w', encoding='utf-8') as file:
        for kangxi_code in range(kangxi_start, kangxi_end + 1):
            kangxi_char = chr(kangxi_code)
            common_char = unicodedata.normalize('NFKC', kangxi_char)
            # Skip radicals with no decomposition or a multi-char/out-of-range one.
            if (common_char != kangxi_char and len(common_char) == 1
                    and common_start <= ord(common_char) <= common_end):
                file.write(f"{kangxi_char} (Kangxi: {hex(kangxi_code)}) -> {common_char} (Common: {hex(ord(common_char))})\n")
+
+
+if __name__ == "__main__":
+    # ofd = OFD()
+
+    # p = r'D:\Project\format_conversion_maxcompute\format_convert\temp\2b42e0b44cea11f0ab9644f971944973\2b4307ae4cea11f0992a44f971944973_ofd\Doc_0\Res\19.ttf'
+    # font = TTFont(p)  # 替换为你的TTF文件路径
+    # print('font', font.keys())
+    #
+    # # 访问 GlyphOrder 表
+    # glyph_order = font['glyf']
+    # print("Glyph Order:", glyph_order.glyphs)
+    #
+    # # 访问 head 表
+    # head = font['head']
+    # print("Font Head:")
+    # print(f" - Font Magic Number: {head.magicNumber}")
+    # print(f" - Font Version: {head.fontRevision}")
+    # print(f" - Font Flags: {head.flags}")
+    # print(f" - Units per Em: {head.unitsPerEm}")
+    # print(f" - Created: {head.created}")
+    # print(f" - Modified: {head.modified}")
+    #
+    # # 访问 hhea 表
+    # hhea = font['hhea']
+    # print("Horizontal Header:")
+    # print(f" - Ascent: {hhea.ascent}")
+    # print(f" - Descent: {hhea.descent}")
+    # print(f" - Line Gap: {hhea.lineGap}")
+    #
+    # # 访问 maxp 表
+    # maxp = font['maxp']
+    # print("Maximum Profile:")
+    # print(f" - Number of Glyphs: {maxp.numGlyphs}")
+    #
+    # # 访问 OS/2 表
+    # os2 = font['OS/2']
+    # print("OS/2 and Windows Metrics:")
+    # print(f" - Weight Class: {os2.usWeightClass}")
+    # print(f" - Width Class: {os2.usWidthClass}")
+    # print(f" - Type: {os2.fsType}")
+    #
+    # # 访问 hmtx 表
+    # hmtx = font['hmtx']
+    # print("Horizontal Metrics:")
+    # for glyph_name, metrics in hmtx.metrics.items():
+    #     print(f" - Glyph '{glyph_name}': Advance Width = {metrics[0]}, Left Side Bearing = {metrics[1]}")
+    #
+    # # 访问 loca 表
+    # loca = font.get('loca')
+    # print("Locations:")
+    # for i, location in enumerate(loca):
+    #     print(f" - Glyph {i}: {location}")
+    #
+    # # 访问 glyf 表
+    # glyf = font.get('glyf')
+    # for glyph_name in glyf.glyphs:
+    #     glyph = glyf[glyph_name]
+    #     print(f"Glyph '{glyph_name}':")
+    #     print(f" - Number of Contours: {glyph.numberOfContours}")
+    #     if glyph.numberOfContours > 0:
+    #         print(f" - X Minimum: {glyph.xMin}")
+    #         print(f" - Y Minimum: {glyph.yMin}")
+    #         print(f" - X Maximum: {glyph.xMax}")
+    #         print(f" - Y Maximum: {glyph.yMax}")
+    #     else:
+    #         print(" - No Contours")
+    #     print()
+    #
+    # # 访问 name 表
+    # name = font['name']
+    # print("Font Name Entries:")
+    # for record in name.names:
+    #     print(f" - Name ID: {record.nameID}")
+    #     print(f" - Platform ID: {record.platformID}")
+    #     print(f" - Encoding ID: {record.platEncID}")
+    #     print(f" - Language ID: {record.langID}")
+    #     print(f" - Name: {record.toUnicode()}")
+    #     print()
+    # font.close()
+    # print(best_cmap)
+
+    import unicodedata
+    #
+    # # 示例
+    # text = "仁和坪镇杨柳池村⼈居环境整治项⽬终⽌"
+    # standardized_text = unicodedata.normalize('NFD', text)
+    # print(f"标准化后的文本: {standardized_text}")
+
+    # import unicodedata
+    #
+    #
+    #
+    # similar_characters = find_similar_characters()
+    #
+    # for pair in similar_characters:
+    #     print(f"原始字符: {pair[0]}, 兼容字符: {pair[1]}")
+    #
+    # print(f"共找到 {len(similar_characters)} 对相似中文字符。")
+
+
+    # 使用示例
+    output_path = 'chinese_characters.txt'
+    # save_chinese_characters(output_path)
+
+    # 获取并打印 Unicode 编码
+    # char = '⽬'
+    # # char = '目'
+    # print(f"字符 '{char}' 的 Unicode 编码是: {ord(char):04X}")

+ 37 - 0
format_convert/easyofd/easyofd/parser_ofd/__init__.py

@@ -0,0 +1,37 @@
import os
import sys

from loguru import logger
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.cidfonts import UnicodeCIDFont
from reportlab.pdfbase.ttfonts import TTFont

# Make the project root importable for the absolute import below.
sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../../")


# from ofd_parser import *


# Map of local font files to the alias names they may appear under in OFD files.
font_map = {"simsun.ttc":["宋体", "SWPMEH+SimSun","SimSun","SWDKON+SimSun"],
            'simkai.ttf':["KaiTi","楷体","SWLCQE+KaiTi","SWHGME+KaiTi","BWSimKai"],
            # 'STKAITI.TTF':["华文楷体 常规","STKAITI","华文楷体"],
            "COURI.TTF":["CourierNewPSMT","CourierNew","SWCRMF+CourierNewPSMT","SWANVV+CourierNewPSMT"],
            "courbd.TTF":["Courier New"],
            "simhei.ttf":["SimHei","hei","黑体"]
            }
# Built-in CID font shipped with reportlab; always available.
pdfmetrics.registerFont(UnicodeCIDFont('STSong-Light'))

# Register every alias for every font file. A missing/unreadable font file
# must not abort import, so failures are logged and skipped.
for font, names in font_map.items():
    for name in names:
        try:
            pdfmetrics.registerFont(TTFont(name, font))
        except Exception:
            # Was a bare ``except:``, which also swallowed SystemExit and
            # KeyboardInterrupt; narrowed to Exception.
            logger.warning(f"FONT  registerFont failed {font}: {name}")

from format_convert.easyofd.easyofd.parser_ofd.ofd_parser import OFDParser

__all__ = ["OFDParser"]

+ 145 - 0
format_convert/easyofd/easyofd/parser_ofd/file_annotation_parser.py

@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_annotation_parser.py
+# CREATE_TIME: 2025/3/28 14:12
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 注释解析
+import re
+
+from loguru import logger
+from .file_parser_base import FileParserBase
+
+
+# class AnnotationsParser(FileParserBase):
+#     """
+#     Parser Annotations
+#     注释信息-总
+#     /xml_dir/Doc_0/Pages/Page_0/Content.xml
+#     """
+#
+#     def __call__(self):
+#         info = {}
+#         annotations_res: list = []
+#         annotations_res_key = "ofd:Page"
+#         self.recursion_ext(self.xml_obj, annotations_res, annotations_res_key)
+#         # logger.debug(f"annotations_res is {annotations_res}")
+#         if annotations_res:
+#             for i in annotations_res:
+#                 page_id = i.get("@PageID")
+#                 if not page_id:
+#                     # logger.debug(f"page_id is null ")
+#                     continue
+#                 file_Loc = i.get("ofd:FileLoc")
+#                 if not file_Loc:
+#                     # logger.debug(f"file_Loc is null ")
+#                     continue
+#                 info[page_id] = {
+#                     "FileLoc": file_Loc,
+#                 }
+#
+#         return info
+#
+#
+# class AnnotationFileParser(FileParserBase):
+#     """
+#     Parser Annotation
+#     注释类 包含 签名注释 水印注释 信息注释
+#     """
+#
+#     AnnoType = {
+#         "Watermark": {
+#             "name": "水印",
+#             "type": "Watermark"
+#         },
+#         "Link": {
+#             "name": "链接",
+#             "type": "Link"
+#         }
+#         ,
+#         "Path": {
+#             "name": "路径",
+#             "type": "Path"
+#         },
+#         "Highlight": {
+#             "name": "高亮",
+#             "type": "Highlight"
+#         },
+#         "Stamp": {
+#             "name": "签章",
+#             "type": "Highlight"
+#         }
+#     }
+#
+#     def normalize_font_name(self, font_name):
+#         """将字体名称规范化,例如 'Times New Roman Bold' -> 'TimesNewRoman-Bold'"""
+#         # 替换空格为无,并将样式(Bold/Italic等)用连字符连接
+#         if not isinstance(font_name, str):
+#             return ""
+#         normalized = font_name.replace(' ', '')
+#         # 处理常见的样式后缀
+#         for style in ['Bold', 'Italic', 'Regular', 'Light', 'Medium', ]:
+#             if style in normalized:
+#                 normalized = normalized.replace(style, f'-{style}')
+#
+#         # todo 特殊字体名规范 后续存在需要完善
+#         if normalized == "TimesNewRoman":
+#             normalized = normalized.replace("TimesNewRoman", "Times-Roman")
+#         return normalized
+#
+#     def __call__(self):
+#         info = {}
+#         public_res: list = []
+#         public_res_key = "ofd:Page"
+#         self.recursion_ext(self.xml_obj, public_res, public_res_key)
+#
+#         if public_res:
+#             for i in public_res:
+#                 info[i.get("@ID")] = {
+#                     "FontName": self.normalize_font_name(i.get("@FontName")),
+#                     "FontNameORI": i.get("@FontName"),
+#                     "FamilyName": self.normalize_font_name(i.get("@FamilyName")),
+#                     "FamilyNameORI": i.get("@FamilyName"),
+#                     "Bold": i.get("@Bold"),
+#                     "Serif": i.get("@Serif"),
+#                     "FixedWidth": i.get("@FixedWidth"),
+#                     "FontFile": i.get("ofd:FontFile"),
+#                 }
+#         return info
+
+
class AnnotationFileParser(FileParserBase):
    """
    Annotations.xml is the per-doc root for annotation data; it holds the
    locations of the per-page annotation files.

    /xml_dir/Doc_0/Annotations.xml
    """

    def loc2page_no(self, loc, idx):
        """Derive a page number from a FileLoc string; fall back to *idx*."""
        match = re.search(r"\d+", loc)
        return int(match.group()) if match else idx

    def __call__(self):
        # Collect every "ofd:Page" node anywhere in the annotation tree.
        pages: list = []
        self.recursion_ext(self.xml_obj, pages, "ofd:Page")

        id_to_page_no = {}
        if pages:
            # Map each page's ID to a page number derived from its file path.
            id_to_page_no = {
                node.get("@PageID"): self.loc2page_no(node.get("ofd:FileLoc"), idx)
                for idx, node in enumerate(pages)
            }
            # Reduce the nodes to their file locations.
            pages = [node.get("ofd:FileLoc") if isinstance(node, dict) else node
                     for node in pages]

        return {"annot_page": pages, "annot_page_id_map": id_to_page_no}

+ 7 - 0
format_convert/easyofd/easyofd/parser_ofd/file_attachment_parser.py

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_attachment_parser.py
+# CREATE_TIME: 2025/4/9 18:52
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:

+ 140 - 0
format_convert/easyofd/easyofd/parser_ofd/file_content_parser.py

@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_content_parser.py
+# CREATE_TIME: 2025/3/28 11:47
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析正文
+from loguru import  logger
+from .file_parser_base import FileParserBase
+
+
class ContentFileParser(FileParserBase):
    """
    Parse page content (and template) files: text, path/line and image objects.
    /xml_dir/Doc_0/Doc_0/Pages/Page_0/Content.xml
    """

    def fetch_cell_info(self, row, TextObject):
        """Build one text-cell dict from a TextObject row and one TextCode node.

        row: the "ofd:TextObject" dict; TextObject: one "ofd:TextCode" dict.
        Fixed: the previous version initialized ``cell_d = {}`` twice.
        """
        cell_d = {}
        cell_d["ID"] = row['@ID']
        # Glyph transform info (glyph ids for embedded fonts), when present.
        if row.get("ofd:CGTransform"):
            cell_d["Glyphs_d"] = {
                "Glyphs": row.get("ofd:CGTransform").get("ofd:Glyphs"),
                "GlyphCount": row.get("ofd:CGTransform").get("@GlyphCount"),
                "CodeCount": row.get("ofd:CGTransform").get("@CodeCount"),
                "CodePosition": row.get("ofd:CGTransform").get("@CodePosition"),
            }

        # Text bounding box.
        cell_d["pos"] = [float(pos_i) for pos_i in row['@Boundary'].split(" ")]
        # Optional clip box, nested several levels down.
        if row.get('ofd:Clips', {}).get('ofd:Clip', {}).get('ofd:Area', {}).get('ofd:Path', {}):
            try:
                cell_d["clips_pos"] = [float(pos_i) for pos_i in
                                       row.get('ofd:Clips', {})
                                           .get('ofd:Clip', {})
                                           .get('ofd:Area', {})
                                           .get('ofd:Path', {})
                                           .get('@Boundary', "")
                                           .split(" ")]
            except Exception:
                # Was a bare ``except:``; malformed clip boundaries are skipped.
                pass
        cell_d["text"] = str(TextObject.get('#text'))
        cell_d["font"] = row['@Font']          # font id
        cell_d["size"] = float(row['@Size'])   # font size

        color = self.ofd_param("ofd:FillColor", row).get("@Value", "0 0 0")
        cell_d["color"] = tuple(color.split(" "))

        cell_d["DeltaY"] = TextObject.get("@DeltaY", "")  # per-char y offsets (one way vertical text is encoded)
        cell_d["DeltaX"] = TextObject.get("@DeltaX", "")  # per-char x offsets
        cell_d["CTM"] = row.get("@CTM", "")               # transform matrix
        cell_d["X"] = TextObject.get("@X", "")            # x of text within its box
        cell_d["Y"] = TextObject.get("@Y", "")            # y of text within its box
        return cell_d

    def __call__(self) -> dict:
        """Extract text, line and image objects from the page xml.

        Returns {"text_list": [...], "img_list": [...], "line_list": [...]}.
        (The previous annotation said ``-> list``; the return value has
        always been this dict.)
        """
        text_list = []
        img_list = []
        line_list = []

        content_d = {
            "text_list": text_list,
            "img_list": img_list,
            "line_list": line_list,
        }

        # --- text objects -------------------------------------------------
        text: list = []
        self.recursion_ext(self.xml_obj, text, "ofd:TextObject")
        for row in text:
            text_code = row.get('ofd:TextCode', {})
            if isinstance(text_code, list):
                for node in text_code:
                    if not node.get('#text'):
                        continue
                    text_list.append(self.fetch_cell_info(row, node))
            elif isinstance(text_code, dict):
                if not text_code.get('#text'):
                    continue
                text_list.append(self.fetch_cell_info(row, text_code))
            else:
                logger.error(f"'ofd:TextCode' format nonsupport  {row.get('ofd:TextCode', {})}")
                continue

        # --- path / line objects -----------------------------------------
        line: list = []
        self.recursion_ext(self.xml_obj, line, "ofd:PathObject")
        for node in line:
            line_d = {}
            try:
                line_d["ID"] = node.get("@ID", "")
                line_d["pos"] = [float(pos_i) for pos_i in node['@Boundary'].split(" ")]
                line_d["LineWidth"] = node.get("@LineWidth", "")
                line_d["AbbreviatedData"] = node.get("ofd:AbbreviatedData", "")  # path drawing commands
                line_d["FillColor"] = self.ofd_param("ofd:FillColor", node).get('@Value', "0 0 0").split(" ")
                line_d["StrokeColor"] = self.ofd_param("ofd:StrokeColor", node).get('@Value', "0 0 0")
                line_d["CTM"] = node.get("@CTM", "")  # transform matrix
            except KeyError as e:
                logger.error(f"{e} \n line is {node} \n")
                continue
            line_list.append(line_d)

        # --- image objects ------------------------------------------------
        img: list = []
        self.recursion_ext(self.xml_obj, img, "ofd:ImageObject")
        for node in img:
            img_list.append({
                "CTM": node.get("@CTM", ""),  # transform matrix
                # NOTE(review): key "ID" (not "@ID") kept from the original --
                # likely always ""; confirm before changing, callers may rely on it.
                "ID": node.get("ID", ""),
                "ResourceID": node.get("@ResourceID", ""),
                "pos": [float(pos_i) for pos_i in node['@Boundary'].split(" ")],
            })

        return content_d

+ 7 - 0
format_convert/easyofd/easyofd/parser_ofd/file_customtag_parser.py

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_customtag_parser.py
+# CREATE_TIME: 2025/4/9 18:51
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:

+ 104 - 0
format_convert/easyofd/easyofd/parser_ofd/file_deal.py

@@ -0,0 +1,104 @@
+# coding: utf-8
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+#PROJECT_NAME: D:\code\easyofd\easyofd\parser
+#CREATE_TIME: 2023-07-27 
+#E_MAIL: renoyuan@foxmail.com
+#AUTHOR: reno 
+#NOTE:  文件处理
+import os
+import base64
+import shutil
+from typing import Any
+from uuid import uuid1
+
+import xmltodict
+import zipfile
+from loguru import logger
+
+from .path_parser import PathParser
+
+
+class FileRead(object):
+    """Unpack a base64-encoded OFD archive and build an in-memory file tree.
+
+    Resulting tree keys:
+      'root'     : the unzip directory (holds OFD.xml)
+      'root_doc' : path of OFD.xml when present, else ""
+      xml paths  : parsed xml objects (xmltodict)
+      other paths: base64 strings of the raw bytes
+    """
+    def __init__(self, ofdb64:str):
+
+        self.ofdbyte = base64.b64decode(ofdb64) 
+        pid=os.getpid()
+        # pid + uuid keeps the temp archive name unique across processes
+        self.name = f"{pid}_{str(uuid1())}.ofd"
+        self.pdf_name = self.name.replace(".ofd",".pdf")
+        self.zip_path = f"{os.getcwd()}/{self.name}"
+        self.unzip_path = ""
+        self.file_tree = {}
+    
+    def unzip_file(self, unzip_dir=None):
+        """
+        Write the OFD bytes to disk and extract the zip archive.
+        :param unzip_dir: target directory; when None, extract next to the archive
+        :return: None (sets self.unzip_path / self.zip_path)
+        """
+        if unzip_dir is None:
+            self.unzip_path = self.zip_path.split('.')[0]
+            self.zip_path = f"{os.getcwd()}/{self.name}"
+        else:
+            self.unzip_path = unzip_dir
+            # NOTE(review): plain string concatenation -- unzip_dir must end
+            # with a path separator or the archive lands beside it.
+            self.zip_path = f"{unzip_dir}{self.name}"
+        print('ofd self.unzip_path', self.unzip_path)
+        print('ofd self.zip_path', self.zip_path)
+
+        with open(self.zip_path,"wb") as f:
+            f.write(self.ofdbyte)
+
+        with zipfile.ZipFile(self.zip_path, 'r') as f:
+            for file in f.namelist():
+                # print('file', file)
+                # Skip attachments; they are not rendered.
+                if 'Attachs' in file:
+                    continue
+                f.extract(file, path=self.unzip_path)
+        # NOTE(review): self.save_xml / self.xml_name are only assigned in
+        # __call__; calling unzip_file directly would raise AttributeError here.
+        if self.save_xml:
+            print("saving xml {}".format(self.xml_name))
+            with zipfile.ZipFile(self.zip_path, 'r') as f:
+                for file in f.namelist():
+                    f.extract(file, path=self.xml_name)
+       
+    def buld_file_tree(self):
+        "Build the tree: xml paths -> parsed objects, other paths -> b64 strings"
+        self.file_tree["root"] = self.unzip_path
+        self.file_tree["pdf_name"] = self.pdf_name
+        for root, dirs, files in os.walk(self.unzip_path):
+            for file in files:
+                
+                abs_path = os.path.join(root,file)
+                # Resource files -> base64 string; xml files -> xmltodict object
+                self.file_tree[abs_path] = str(base64.b64encode(open(f"{abs_path}","rb").read()),"utf-8")  \
+                    if "xml" not in file else xmltodict.parse(open(f"{abs_path}" , "r", encoding="utf-8").read())
+        self.file_tree["root_doc"] = os.path.join(self.unzip_path,"OFD.xml") if os.path.join(self.unzip_path,"OFD.xml") in self.file_tree else ""
+  
+        # if os.path.exists(self.unzip_path):
+        #     shutil.rmtree(self.unzip_path)
+       
+        # if os.path.exists(self.zip_path):
+        #     os.remove(self.zip_path)
+                   
+    def __call__(self, *args: Any, **kwds: Any) -> Any:
+        """Extract the archive and return the populated file tree."""
+        self.save_xml=kwds.get("save_xml",False)
+        self.xml_name=kwds.get("xml_name")
+        self.save_dir = kwds.get('save_dir')
+    
+        self.unzip_file(self.save_dir)
+        self.buld_file_tree()
+        return self.file_tree 
+
+
+if __name__ == "__main__":
+    with open(r"D:/code/easyofd/test/增值税电子专票5.ofd","rb") as f:
+        ofdb64 = str(base64.b64encode(f.read()),"utf-8")
+    a = FileRead(ofdb64)()
+    print(list(a.keys()))

+ 99 - 0
format_convert/easyofd/easyofd/parser_ofd/file_doc_parser.py

@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_doc_parser.py
+# CREATE_TIME: 2025/3/28 11:46
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析document
+
+import  re
+
+from .file_parser_base import FileParserBase
+
+
+
class DocumentFileParser(FileParserBase):
    """
    Document.xml is the per-doc root node; it provides the locations of the
    other doc files and the document page size.

    /xml_dir/Doc_0/Document.xml
    """

    def loc2page_no(self, loc, idx):
        """Derive a page number from a BaseLoc string; fall back to *idx*."""
        match = re.search(r"\d+", loc)
        return int(match.group()) if match else idx

    def _extract(self, key):
        """Collect every node matching *key* from the xml tree."""
        found = []
        self.recursion_ext(self.xml_obj, found, key)
        return found

    def __call__(self):
        document_info = {}

        # Page size (ofd:PhysicalBox); empty string when absent.
        boxes = self._extract("ofd:PhysicalBox")
        document_info["size"] = boxes[0] if boxes else ""

        # Resource locations: fonts (PublicRes) and static images (DocumentRes).
        document_info["public_res"] = self._extract("ofd:PublicRes")
        document_info["document_res"] = self._extract("ofd:DocumentRes")

        # Template pages, reduced to their BaseLoc paths.
        tpls = self._extract("ofd:TemplatePage")
        if tpls:
            tpls = [t.get("@BaseLoc") if isinstance(t, dict) else t for t in tpls]
        document_info["tpls"] = tpls

        # Content pages: map page IDs to page numbers, then keep the paths.
        pages = self._extract("ofd:Page")
        page_id_map = {}
        if pages:
            page_id_map = {
                p.get("@ID"): self.loc2page_no(p.get("@BaseLoc"), idx)
                for idx, p in enumerate(pages)
            }
            pages = [p.get("@BaseLoc") if isinstance(p, dict) else p for p in pages]
        document_info["page"] = pages
        document_info["page_id_map"] = page_id_map

        # Remaining doc-level sections.
        document_info["Annotations"] = self._extract("ofd:Annotations")
        document_info["attachments"] = self._extract("ofd:Attachments")
        document_info["custom_tag"] = self._extract("ofd:CustomTags")

        return document_info
+
+
+
+
+
+

+ 36 - 0
format_convert/easyofd/easyofd/parser_ofd/file_docres_parser.py

@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_docres_parser.py
+# CREATE_TIME: 2025/3/28 11:48
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析 DocumentRes
+
+import os
+
+from .file_parser_base import FileParserBase
+
class DocumentResFileParser(FileParserBase):
    """
    Parse DocumentRes/PublicRes and extract the multimedia (image) entries.
    /xml_dir/Doc_0/DocumentRes.xml
    /xml_dir/Doc_0/PublicRes.xml
    """

    def __call__(self):
        media_nodes: list = []
        self.recursion_ext(self.xml_obj, media_nodes, "ofd:MultiMedia")

        info = {}
        for node in media_nodes:
            file_name = node.get("ofd:MediaFile", "")
            # File suffix = extension without the leading dot (may be empty).
            suffix = os.path.splitext(file_name)[-1].replace(".", "")
            info[node.get("@ID")] = {
                "format": node.get("@Format", ""),
                "wrap_pos": node.get("@wrap_pos", ""),
                "type": node.get("@Type", ""),
                "suffix": suffix,
                "fileName": file_name,
            }
        return info

+ 41 - 0
format_convert/easyofd/easyofd/parser_ofd/file_ofd_parser.py

@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_ofd_parser.py
+# CREATE_TIME: 2025/3/28 11:45
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析OFD
+from .file_parser_base import FileParserBase
+
class OFDFileParser(FileParserBase):
    """
    Parse the top-level OFD.xml file.
    /xml_dir/OFD.xml
    """

    def __call__(self):
        # Output key -> xml element name to collect.
        sections = (
            ("doc_root", "ofd:DocRoot"),
            ("signatures", "ofd:Signatures"),
            ("creator", "ofd:Creator"),
            ("creationDate", "ofd:CreationDate"),
        )
        info = {}
        for out_key, xml_key in sections:
            found: list = []
            self.recursion_ext(self.xml_obj, found, xml_key)
            info[out_key] = found
        return info

+ 58 - 0
format_convert/easyofd/easyofd/parser_ofd/file_parser.py

@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: D:\code\easyofd\easyofd\parser
+# CREATE_TIME: 2023-07-27
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 每种类型的文件定义一个解析器
+
+import sys
+
+sys.path.insert(0, "..")
+import logging
+import os
+import traceback
+import base64
+import re
+from typing import Any
+from .parameter_parser import ParameterParser
+logger = logging.getLogger("root")
+
+
+class FileParserBase(object):
+    """Base xml parser: holds the tree and a recursive key extractor.
+
+    NOTE(review): duplicate of the class in file_parser_base.py; consider
+    keeping only one definition.
+    """
+
+    def __init__(self, xml_obj):
+        # Rejects falsy xml objects. NOTE(review): ``assert`` is stripped
+        # under ``python -O`` -- raising ValueError would be more robust.
+        assert xml_obj
+        self.ofd_param = ParameterParser()
+        self.xml_obj = xml_obj
+        # print(xml_obj)
+
+    def recursion_ext(self, need_ext_obj, ext_list, key):
+        """
+        Recursively collect xml elements matching *key*.
+        need_ext_obj : xml tree (nested dict/list from xmltodict)
+        ext_list: output container, appended/extended in place
+        key: element name to collect
+        """
+        if isinstance(need_ext_obj, dict):
+            for k, v in need_ext_obj.items():
+                if k == key:
+                    # dict/str matches are appended; list matches are flattened
+                    if isinstance(v, (dict, str)):
+                        ext_list.append(v)
+                    elif isinstance(v, list):
+                        ext_list.extend(v)
+                else:
+                    # Descend into nested containers looking for *key*.
+                    if isinstance(v, dict):
+                        self.recursion_ext(v, ext_list, key)
+                    elif isinstance(v, list):
+                        for cell in v:
+                            self.recursion_ext(cell, ext_list, key)
+                    else:
+                        pass
+        else:
+            print(type(need_ext_obj))
+
+
+if __name__ == "__main__":
+    FileParserBase("")()

+ 63 - 0
format_convert/easyofd/easyofd/parser_ofd/file_parser_base.py

@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_parser_base.py
+# CREATE_TIME: 2025/3/28 11:43
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: base 解析器
+
+import sys
+
+sys.path.insert(0, "..")
+import logging
+import os
+import traceback
+import base64
+import re
+from typing import Any
+from .parameter_parser import ParameterParser
+logger = logging.getLogger("root")
+
+
class FileParserBase(object):
    """Base class for xml parsers: holds the tree and a recursive extractor."""

    def __init__(self, xml_obj):
        assert xml_obj
        self.ofd_param = ParameterParser()
        self.xml_obj = xml_obj

    def recursion_ext(self, need_ext_obj, ext_list, key):
        """
        Recursively collect every element named *key* from the xml tree.
        need_ext_obj : xml tree (nested dict/list from xmltodict)
        ext_list: output container, mutated in place
        key: element name to collect
        """
        if not isinstance(need_ext_obj, dict):
            print(type(need_ext_obj))
            return
        for child_key, child in need_ext_obj.items():
            if child_key == key:
                # A matching list is flattened; dict/str appended as-is.
                if isinstance(child, list):
                    ext_list.extend(child)
                elif isinstance(child, (dict, str)):
                    ext_list.append(child)
            elif isinstance(child, dict):
                self.recursion_ext(child, ext_list, key)
            elif isinstance(child, list):
                for item in child:
                    self.recursion_ext(item, ext_list, key)
+

+ 52 - 0
format_convert/easyofd/easyofd/parser_ofd/file_publicres_parser.py

@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_publicres_parser.py
+# CREATE_TIME: 2025/3/28 11:49
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: PublicResFileParser
+
+from .file_parser_base import FileParserBase
+
+
class PublicResFileParser(FileParserBase):
    """
    Parse PublicRes: extract the shared (public) font information.
    /xml_dir/Doc_0/PublicRes.xml
    """

    def normalize_font_name(self, font_name):
        """Normalize a font name, e.g. 'Times New Roman Bold' -> 'TimesNewRoman-Bold'."""
        if not isinstance(font_name, str):
            return ""
        # Drop spaces, then hyphenate any style suffix found in the name.
        result = font_name.replace(' ', '')
        for style in ('Bold', 'Italic', 'Regular', 'Light', 'Medium'):
            if style in result:
                result = result.replace(style, f'-{style}')
        # TODO: special-case font names; extend as more fonts show up.
        if result == "TimesNewRoman":
            result = result.replace("TimesNewRoman", "Times-Roman")
        return result

    def __call__(self):
        font_nodes: list = []
        self.recursion_ext(self.xml_obj, font_nodes, "ofd:Font")

        info = {}
        for node in font_nodes:
            info[node.get("@ID")] = {
                "FontName": self.normalize_font_name(node.get("@FontName")),
                "FontNameORI": node.get("@FontName"),
                "FamilyName": self.normalize_font_name(node.get("@FamilyName")),
                "FamilyNameORI": node.get("@FamilyName"),
                "Bold": node.get("@Bold"),
                "Serif": node.get("@Serif"),
                "FixedWidth": node.get("@FixedWidth"),
                "FontFile": node.get("ofd:FontFile"),
            }
        return info

+ 63 - 0
format_convert/easyofd/easyofd/parser_ofd/file_signature_parser.py

@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_signature_parser.py
+# CREATE_TIME: 2025/3/28 14:13
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 签章解析
+
+from .file_parser_base import FileParserBase
+
class SignaturesFileParser(FileParserBase):
    """Parse Signatures.xml (the signature index) under /xml_dir/Doc_0/.

    Returns ``{signature_id: {"BaseLoc", "Type", "ID"}}``.
    """

    def __call__(self):
        """Collect every ofd:Signature node keyed by its @ID attribute."""
        nodes: list = []
        self.recursion_ext(self.xml_obj, nodes, "ofd:Signature")
        info = {}
        for node in nodes:
            info[node.get("@ID")] = {
                "BaseLoc": node.get("@BaseLoc"),
                "Type": node.get("@Type"),
                "ID": node.get("@ID"),
            }
        return info
+
+
class SignatureFileParser(FileParserBase):
    """Parse a single Signature.xml: stamp placement plus SignedValue path."""

    def __call__(self, prefix=""):
        """Return the stamp annotation dict for this signature.

        :param prefix: directory prefix joined onto the SignedValue location.
        :return: ``{"PageRef", "Boundary", "ID", "SignedValue"}`` or ``{}``.
        """
        stamp_nodes: list = []
        self.recursion_ext(self.xml_obj, stamp_nodes, "ofd:StampAnnot")

        signed_nodes: list = []
        self.recursion_ext(self.xml_obj, signed_nodes, "ofd:SignedValue")

        # Fall back to the conventional file name when no SignedValue node exists.
        signed_loc = f"{prefix}/{signed_nodes[0]}" if signed_nodes else f"{prefix}/SignedValue.dat"

        info = {}
        # NOTE(review): when several StampAnnot nodes exist only the last one
        # survives (each iteration rebinds ``info``) -- mirrors original logic.
        for node in stamp_nodes:
            info = {
                "PageRef": node.get("@PageRef"),  # page id the stamp sits on
                "Boundary": node.get("@Boundary"),
                "ID": node.get("@ID"),
                "SignedValue": signed_loc,
            }
        return info

+ 100 - 0
format_convert/easyofd/easyofd/parser_ofd/find_seal_img.py

@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd read_seal_img
+# CREATE_TIME: 2024/5/28 14:13
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note: 根据 ASN.1 解析签章 拿到 签章图片
+import io
+
+from PIL import Image, UnidentifiedImageError
+from loguru import logger
+from pyasn1.codec.der.decoder import decode
+from pyasn1.type import univ
+from pyasn1.error import PyAsn1Error
+
+
+
class SealExtract(object):
    """Extract stamp/seal images from an OFD ``SignedValue.dat`` blob.

    The file is DER-decoded with pyasn1; every OctetString found in the
    structure whose hex payload decodes as an image is returned as a PIL
    image via ``__call__``.
    """

    def __init__(self):
        pass

    def read_signed_value(self, path):
        """Read *path* and DER-decode its contents.

        :param path: path to a SignedValue.dat file.
        :return: the decoded ASN.1 object, or None when decoding fails.
        """
        with open(path, 'rb') as file:
            binary_data = file.read()
        try:
            decoded_data, _ = decode(binary_data)
        except PyAsn1Error:
            # Not valid DER -- treat as "no signature data".
            # BUG FIX: the previous ``finally: return decoded_data`` swallowed
            # unexpected exceptions, and raised NameError when decode() failed
            # with anything other than PyAsn1Error (decoded_data unbound).
            decoded_data = None
        return decoded_data

    def find_octet_strings(self, asn1_data, octet_strings: list):
        """Recursively collect every OctetString inside *asn1_data*."""
        if isinstance(asn1_data, univ.OctetString):
            octet_strings.append(asn1_data)
        elif isinstance(asn1_data, univ.Sequence) or isinstance(asn1_data, univ.Set):
            # NOTE(review): iterating a pyasn1 Sequence yields component
            # values, which are then used as subscripts here -- confirm this
            # is the intended traversal of named components.
            for component in asn1_data:
                self.find_octet_strings(asn1_data[f"{component}"], octet_strings)
        elif isinstance(asn1_data, univ.Choice):
            self.find_octet_strings(asn1_data.getComponent(), octet_strings)
        elif isinstance(asn1_data, univ.Any):
            # Opaque blob: try to decode it as nested ASN.1.
            try:
                sub_data, _ = decode(asn1_data.asOctets())
                self.find_octet_strings(sub_data, octet_strings)
            except PyAsn1Error:
                pass

    def hex_to_image(self, hex_data, image_format='PNG', inx=0):
        """Decode a hex string into a PIL image.

        :param hex_data: hex-encoded image payload (without the "0x" prefix).
        :param image_format: kept for interface compatibility (unused).
        :param inx: kept for interface compatibility (unused).
        :return: a PIL Image, or None when the bytes are not an image.
        """
        binary_data = bytes.fromhex(hex_data)
        image_stream = io.BytesIO(binary_data)
        try:
            return Image.open(image_stream)
        except UnidentifiedImageError:
            # Many OctetStrings are not images (certificates, digests, ...).
            return None

    def __call__(self, path):
        """Return the list of seal images found in *path* (usually 0 or 1)."""
        decoded_data = self.read_signed_value(path)
        octet_strings = []
        img_list = []  # only one seal observed so far; keep a list anyway
        if decoded_data:
            self.find_octet_strings(decoded_data, octet_strings)
            for i, octet_string in enumerate(octet_strings):
                pretty = str(octet_string.prettyPrint())
                if pretty.startswith("0x"):
                    img = self.hex_to_image(pretty[2:], inx=i)
                    if img:
                        img_list.append(img)
        return img_list
+
if __name__ == "__main__":
    # Ad-hoc smoke test against a local sample signature blob.
    sample_path = r"F:\code\easyofd\test\1111_xml\Doc_0\Signs\Sign_0\SignedValue.dat"
    print(SealExtract()(sample_path))
+

+ 35 - 0
format_convert/easyofd/easyofd/parser_ofd/img_deal.py

@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd img_deal
+# CREATE_TIME: 2024/7/18 11:20
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note: img 操作
+from io import BytesIO
class DealImg(object):
    """Helpers for converting PIL images to raw bytes / byte streams."""

    def __init__(self):
        pass

    def resize(self):
        """Placeholder for future image resizing (not implemented)."""
        pass

    def pil2bytes(self, image):
        """Serialize *image* to PNG and return the raw bytes."""
        stream = BytesIO()
        image.save(stream, format='PNG')  # PNG keeps the conversion lossless
        data = stream.getvalue()
        stream.close()
        return data

    def pil2bytes_io(self, image):
        """Serialize *image* to PNG and return the (still open) BytesIO stream."""
        stream = BytesIO()
        image.save(stream, format='PNG')
        return stream
+
+
+

+ 607 - 0
format_convert/easyofd/easyofd/parser_ofd/ofd_parser.py

@@ -0,0 +1,607 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: D:\code\easyofd\easyofd\parser
+# CREATE_TIME: 2023-07-27
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: ofd解析主流程
+
+import os
+import sys
+sys.path.append(os.path.dirname(__file__) + "/../../../../")
+from format_convert.easyofd.easyofd.parser_ofd.file_ofd_parser import OFDFileParser
+from jbig2_parser import jbig2_parser
+import traceback
+import base64
+import re
+import io
+# import jbigkit
+from typing import Any, List
+from PIL import Image
+from PIL.Image import Image as ImageClass
+from loguru import logger
+
+from format_convert.easyofd.easyofd.parser_ofd.img_deal import DealImg
+from format_convert.easyofd.easyofd.parser_ofd.file_deal import FileRead
+from format_convert.easyofd.easyofd.parser_ofd.file_ofd_parser import OFDFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_doc_parser import DocumentFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_docres_parser import DocumentResFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_content_parser import ContentFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_annotation_parser import AnnotationFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_publicres_parser import PublicResFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_signature_parser import SignaturesFileParser,SignatureFileParser
+from format_convert.easyofd.easyofd.parser_ofd.path_parser import PathParser
+# todo 解析流程需要大改
+
+
+class OFDParser(object):
+    """
+    OFDParser 解析
+    1 解压文件 创建文件映射表 释放文件
+    2 解析 xml 逐级去 收集需要信息  结构文本 以及 资源
+    2 调用font 注册 字体
+
+    图层顺序 tlp>content>annotation
+    """
+
+    def __init__(self, ofdb64):
+        self.img_deal = DealImg()
+        self.ofdb64 = ofdb64
+        self.file_tree = None
+        # self.jbig2dec_path = r"C:/msys64/mingw64/bin/jbig2dec.exe"
+        self.jbig2dec_path = r'D:\Anaconda3\pkgs\jbig2dec-0.18-ha9979f8_0\Library\bin\jbig2dec.exe'
+
+    def img2data(self, imglist: List[ImageClass]):
+        """
+        imglist to ofd data
+        
+        """
+        OP = 200 / 25.4
+        doc_list = []
+        img_info = {}
+        page_size = []
+        font_info = {}
+        page_info_d = {}
+
+        for idx, img_pil in enumerate(imglist):
+            w, h = img_pil.size
+            img_bytes = self.img_deal.pil2bytes(img_pil)
+            imgb64 = str(base64.b64encode(img_bytes), encoding="utf-8")
+            img_info[str(idx)] = {
+                "format": "jpg",
+                "wrap_pos": "",
+                "type": "IMG",
+                "suffix": "jpg",
+                "fileName": f"{idx}.jpg",
+                "imgb64": imgb64,
+
+            }
+            text_list = []
+            img_list = []
+            img_d = {}
+            img_d["CTM"] = ""  # 平移矩阵换 平移 缩放 旋转
+            img_d["ID"] = str(idx)  # 图片id
+            img_d["ResourceID"] = str(idx)  # 图片id
+            img_d["pos"] = [0, 0, w / OP, h / OP]  # 平移矩阵换
+            page_size = [0, 0, w / OP, h / OP]
+            # print(page_size)
+            img_list.append(img_d)
+
+            content_d = {
+                "text_list": text_list,
+                "img_list": img_list,
+            }
+            page_info_d[idx] = content_d
+        doc_list.append({
+            "pdf_name": "demo.pdf",
+            "doc_no": "0",
+            "images": img_info,
+            "page_size": page_size,
+            "fonts": font_info,
+            "page_info": page_info_d
+        })
+
+        return doc_list
+
    # Fetch the entry (parsed xml object or base64 payload) for a path label.
    def get_xml_obj(self, label):
        """Look up *label* in the extracted file tree by fuzzy path match.

        Separators are unified so Windows and POSIX paths compare equal; the
        first tree entry whose normalized path contains the normalized label
        is returned.  Returns "" when nothing matches.
        """
        assert label
        # print(self.file_tree.keys())
        # NOTE(review): lstrip('./') strips ANY leading '.' and '/' characters,
        # so "../a" also becomes "a" -- harmless for the substring match below,
        # but not a plain "./" prefix removal.
        label =label.lstrip('./')
        for abs_p in self.file_tree:
            # Unify separators to avoid win/linux path mismatches.

            abs_p_compare = abs_p.replace("\\\\", "-").replace("//", "-").replace("\\", "-").replace("/", "-")
            label_compare = label.replace("\\\\", "-").replace("//", "-").replace("\\", "-").replace("/", "-")
            if label_compare in abs_p_compare:
                # logger.info(f"{label} {abs_p}")
                return self.file_tree[abs_p]
        # logger.info(f"{label} not found in ofd file tree")
        return ""
+
    def jb22png_old(self, img_d: dict):
        """
        jb22png (legacy path)
        Requires the external ``jbig2dec`` executable; returns without doing
        anything when it is not installed.

        Dumps the base64 payload to disk, shells out to jbig2dec to produce a
        PNG, and on success rewrites fileName/suffix/format/imgb64 in *img_d*.
        """
        if not os.path.exists(self.jbig2dec_path):
            logger.warning(f"未安装jbig2dec,无法处理jb2文件")
            return

        # Example invocation: jbig2dec.exe -o out.png in.jb2
        fileName = img_d["fileName"]
        print('jb2 file_name', fileName)
        new_fileName = img_d['fileName'].replace(".jb2", ".png")
        with open(fileName, "wb") as f:
            f.write(base64.b64decode(img_d["imgb64"]))
        command = "{} -o {} {}"
        # NOTE(review): os.system interpolates raw paths into a shell command;
        # paths with spaces or shell metacharacters will break or inject --
        # prefer subprocess.run([...], shell=False).
        res = os.system(command.format(self.jbig2dec_path, new_fileName, fileName))
        if res != 0:
            pass
            # logger.warning("jbig2dec conversion failed")
        # if os.path.exists(fileName):
        #     os.remove(fileName)
        if os.path.exists(new_fileName):
            # logger.info(f"jbig2dec converted {fileName} -> {new_fileName}")
            img_d["fileName"] = new_fileName
            img_d["suffix"] = "png"
            img_d["format"] = "png"
            with open(new_fileName, "rb") as f:
                data = f.read()
                img_d["imgb64"] = str(base64.b64encode(data), encoding="utf-8")

            # os.remove(new_fileName)
+
    def jb22png(self, img_d: dict):
        """
        jb22png
        Convert a JBIG2 (.jb2) image resource to PNG in place.

        Reads the .jb2 file at ``img_d["fileName"]``, decodes it with the
        bundled ``jbig2_parser`` extension, writes the PNG next to it, and on
        success updates fileName/suffix/format/imgb64 in *img_d*.
        """

        file_name = img_d["fileName"]
        # print('jb2 file_name', file_name)
        new_file_name = img_d['fileName'].replace(".jb2", ".png")
        with open(file_name, "rb") as f:
            data = f.read()
        # jbig2_parser returns the PNG payload as a byte-like sequence.
        png_data = jbig2_parser.parse_jbig2(data)
        png_bytes = bytes(png_data)
        # print('png_data', png_data)

        # # Convert the byte buffer into an image object
        # image = Image.open(io.BytesIO(png_data))
        #
        # # Save the image as a PNG file
        # image.save(new_file_name, 'PNG')

        with open(new_file_name, 'wb') as f:
            f.write(png_bytes)

        if os.path.exists(new_file_name):
            # logger.info(f"converted {file_name} -> {new_file_name}")
            img_d["fileName"] = new_file_name
            img_d["suffix"] = "png"
            img_d["format"] = "png"
            with open(new_file_name, "rb") as f:
                data = f.read()
                img_d["imgb64"] = str(base64.b64encode(data), encoding="utf-8")

        # Dead alternative implementation using jbigkit, removed:
        # decoder = jbigkit.JbgDecoder(); decoder.decode_in(data);
        # Image.frombytes('1', (w, h), bytes(plane), 'raw', '1;I').save(...)
+
+    def bmp2jpg(self, img_d: dict):
+
+        fileName = img_d["fileName"]
+        new_fileName = img_d['fileName'].replace(".bmp", ".jpg")
+        b64_nmp = self.get_xml_obj(fileName)
+        image_data = base64.b64decode(b64_nmp)
+        image = Image.open(io.BytesIO(image_data))
+        rgb_image = image.convert("RGB")
+        output_buffer = io.BytesIO()
+        rgb_image.save(output_buffer, format="JPEG")
+        image.close()
+        jpeg_bytes = output_buffer.getvalue()
+        b64_jpeg = base64.b64encode(jpeg_bytes).decode('utf-8')
+        output_buffer.close()
+
+        if b64_jpeg:
+            logger.info(f"bmp2jpg处理成功{fileName}>>{new_fileName}")
+            img_d["fileName"] = new_fileName
+            img_d["suffix"] = "jpg"
+            img_d["format"] = "jpg"
+            img_d["imgb64"] = b64_jpeg
+
+    def tif2jpg(self, img_d: dict):
+        fileName = img_d["fileName"]
+        new_fileName = img_d['fileName'].replace(".tif", ".jpg")
+        tif_nmp = self.get_xml_obj(fileName)
+        image_data = base64.b64decode(tif_nmp)
+        image = Image.open(io.BytesIO(image_data))
+        if image.mode in ("RGBA", "LA") or (image.mode == "P" and "transparency" in image.info):
+            image = image.convert("RGB")
+
+            # 创建一个字节流来保存处理后的图像
+        output_buffer = io.BytesIO()
+
+        # 保存图像为 JPEG 格式到字节流中
+        image.save(output_buffer, format="JPEG", quality=95)
+
+        # 获取字节流中的内容并编码为 Base64 字符串
+        jpeg_bytes = output_buffer.getvalue()
+        b64_jpeg = base64.b64encode(jpeg_bytes).decode('utf-8')
+
+        # 关闭图像对象和字节流
+        image.close()
+        output_buffer.close()
+
+        if b64_jpeg:
+            logger.info(f"tif2jpg处理成功{fileName}>>{new_fileName}")
+            img_d["fileName"] = new_fileName
+            img_d["suffix"] = "jpg"
+            img_d["format"] = "jpg"
+            img_d["imgb64"] = b64_jpeg
+
+    def gif2jpg(self, img_d: dict):
+        fileName = img_d["fileName"]
+        new_fileName = img_d['fileName'].replace(".bmp", ".jpg")
+        b64_gif = self.get_xml_obj(fileName)
+        image_data = base64.b64decode(b64_gif)
+        image = Image.open(io.BytesIO(image_data))
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        output_buffer = io.BytesIO()
+        image.save(output_buffer, format="JPEG", quality=95)
+        image.close()
+        jpeg_bytes = output_buffer.getvalue()
+        b64_jpeg = base64.b64encode(jpeg_bytes).decode('utf-8')
+        output_buffer.close()
+
+        if b64_jpeg:
+            logger.info(f"gif2jpg处理成功{fileName}>>{new_fileName}")
+            img_d["fileName"] = new_fileName
+            img_d["suffix"] = "jpg"
+            img_d["format"] = "jpg"
+            img_d["imgb64"] = b64_jpeg
+
+    def parser(self, save_dir):
+        """
+        解析流程
+        doc_0默认只有 一层
+        OFD >  Document.xml > [DocumentRes.xml, PublicRes.xml, Signatures.xml Annotations.xml] > []
+        """
+
+        page_size_details = []
+        default_page_size = []
+        doc_list = []
+        ofd_xml_obj = self.get_xml_obj(self.file_tree["root_doc"])  # OFD.xml xml 对象 
+
+        if ofd_xml_obj:
+            ofd_obj_res = OFDFileParser(ofd_xml_obj)()
+            doc_root_name = ofd_obj_res.get("doc_root")
+            signatures = ofd_obj_res.get("signatures")
+        else:
+            # 考虑根节点丢失情况
+            doc_root_name = ["Doc_0/Document.xml"]
+            signatures = ["Doc_0/Signs/Signatures.xml"]
+
+        doc_root_xml_obj = self.get_xml_obj(doc_root_name[0])
+        doc_root_info = DocumentFileParser(doc_root_xml_obj)()
+        doc_page_size = self.get_page_size(doc_root_xml_obj)
+        # print('doc_page_size', doc_page_size)
+
+        # 注释文本
+        annotations_root_name = doc_root_info.get("Annotations")
+        if annotations_root_name:
+            annotations_root_name = annotations_root_name[0]
+            annot_root_xml_obj = self.get_xml_obj(annotations_root_name)
+            # print('annot_root_xml_obj', annot_root_xml_obj)
+            annot_root_info = AnnotationFileParser(annot_root_xml_obj)()
+            # print('annot_root_info', annot_root_info)
+            doc_root_info.update(annot_root_info)
+        doc_size = doc_root_info.get("size")
+
+        if doc_size:
+            try:
+                default_page_size = [float(pos_i) for pos_i in doc_size.split(" ") if re.match("[\d\.]", pos_i)]
+            except:
+                traceback.print_exc()
+
+        # 字体信息
+        font_info = {}
+        public_res_name: list = doc_root_info.get("public_res")
+        if public_res_name:
+            public_xml_obj = self.get_xml_obj(public_res_name[0])
+            font_info = PublicResFileParser(public_xml_obj)()
+
+            # 注册字体
+            for font_id, font_v in font_info.items():
+                file_name = font_v.get("FontFile")
+                if file_name:
+                    font_b64 = self.get_xml_obj(file_name)
+                    if font_b64:
+                        font_v["font_b64"] = font_b64
+
+        # 图片资源
+        img_info: dict = dict()
+        document_res_name: list = doc_root_info.get("document_res")
+        # print('doc_root_info', doc_root_info)
+        if document_res_name:
+            document_res_xml_obj = self.get_xml_obj(document_res_name[0])
+            # print('document_res_xml_obj', document_res_xml_obj)
+            img_info = DocumentResFileParser(document_res_xml_obj)()
+            # 找到图片b64
+            for img_id, img_v in img_info.items():
+                img_v["imgb64"] = self.get_xml_obj(img_v.get("fileName"))
+                img_v['fileName'] = f"{save_dir}Doc_0\Res\{img_v['fileName']}"
+                # todo ib2 转png C:/msys64/mingw64/bin/jbig2dec.exe -o F:\code\easyofd\test\image_80.png F:\code\easyofd\test\image_80.jb2
+                if img_v["suffix"] == 'jb2':
+                    self.jb22png(img_v)
+                elif img_v["suffix"] == 'bmp':
+                    self.bmp2jpg(img_v)
+                elif img_v["suffix"] == 'tif':
+                    self.tif2jpg(img_v)
+                elif img_v["suffix"] == 'gif':
+                    self.gif2jpg(img_v)
+
+        img_info2: dict = dict()
+        public_res_name: list = doc_root_info.get("public_res")
+        # print('doc_root_info', doc_root_info)
+        if public_res_name:
+            public_res_xml_obj = self.get_xml_obj(public_res_name[0])
+            # print('public_res_xml_obj', public_res_xml_obj)
+            img_info2 = DocumentResFileParser(public_res_xml_obj)()
+            # 找到图片b64
+            for img_id, img_v in img_info2.items():
+                img_v["imgb64"] = self.get_xml_obj(img_v.get("fileName"))
+                # print('img_id, img_v[filename]', img_id, img_v.get('fileName'))
+                img_v['fileName'] = f"{save_dir}Doc_0\Res\{img_v['fileName']}"
+
+                # todo ib2 转png C:/msys64/mingw64/bin/jbig2dec.exe -o F:\code\easyofd\test\image_80.png F:\code\easyofd\test\image_80.jb2
+                if img_v["suffix"] == 'jb2':
+                    self.jb22png(img_v)
+                elif img_v["suffix"] == 'bmp':
+                    self.bmp2jpg(img_v)
+                elif img_v["suffix"] == 'tif':
+                    self.tif2jpg(img_v)
+                elif img_v["suffix"] == 'gif':
+                    self.gif2jpg(img_v)
+            img_info.update(img_info2)
+
+        page_id_map: list = doc_root_info.get("page_id_map")
+        # print('doc_root_info', doc_root_info)
+
+        signatures_page_id = {}
+        # 签章信息
+        signatures_xml_obj = None
+        # if signatures:
+        #     signatures_xml_obj = self.get_xml_obj(signatures[0])
+        # if signatures and signatures_xml_obj:
+        # # if signatures and (signatures_xml_obj := self.get_xml_obj(signatures[0])):
+        # #     logger.debug(f"signatures_xml_obj is {signatures_xml_obj } signatures is {signatures} ")
+        #     signatures_info = SignaturesFileParser(signatures_xml_obj)()
+        #     if signatures_info:  # 获取签章具体信息
+        #         for _, signatures_cell in signatures_info.items():
+        #             # print(signatures_info)
+        #             BaseLoc = signatures_cell.get("BaseLoc")
+        #             signature_xml_obj = self.get_xml_obj(BaseLoc)
+        #             # print(BaseLoc)
+        #             prefix = BaseLoc.split("/")[0]
+        #             signatures_info = SignatureFileParser(signature_xml_obj)(prefix=prefix)
+        #             # print(signatures_info)
+        #             # logger.debug(f"signatures_info {signatures_info}")
+        #             PageRef = signatures_info.get("PageRef")
+        #             Boundary = signatures_info.get("Boundary")
+        #             SignedValue = signatures_info.get("SignedValue")
+        #             sing_page_no = page_id_map.get(PageRef)
+        #             # print("self.file_tree",self.file_tree.keys)
+        #             # print(page_id_map,PageRef)
+        #             # print(SignedValue, self.get_xml_obj(SignedValue))
+        #             # with open("b64.txt","w") as f:
+        #             #     f.write(self.get_xml_obj(SignedValue))
+        #             if signatures_page_id.get(sing_page_no):
+        #                 signatures_page_id[sing_page_no].append(
+        #                     {
+        #                         "sing_page_no": sing_page_no,
+        #                         "PageRef": PageRef,
+        #                         "Boundary": Boundary,
+        #                         "SignedValue": self.get_xml_obj(SignedValue),
+        #                     }
+        #                 )
+        #             else:
+        #                 signatures_page_id[sing_page_no] = [
+        #                     {
+        #                         "sing_page_no": sing_page_no,
+        #                         "PageRef": PageRef,
+        #                         "Boundary": Boundary,
+        #                         "SignedValue": self.get_xml_obj(SignedValue),
+        #                     }
+        #                 ]
+
+        # 注释信息
+        # print('doc_root_info', doc_root_info)
+        # annotation_name: list = doc_root_info.get("Annotations")
+        # annotation_xml_obj = None
+        # if annotation_name:
+        #     annotation_xml_obj = self.get_xml_obj(annotation_name[0])
+        # if annotation_name and annotation_xml_obj:
+        # # if annotation_name and (annotation_xml_obj:= self.get_xml_obj(annotation_name[0])):
+        #     # todo 注释解析
+        #
+        #     # annotation_info = AnnotationFileParser(annotation_xml_obj)()
+        #     annotation_info = AnnotationFileParser(annotation_xml_obj)()
+        #     # logger.debug(f"annotation_info is {annotation_info}")
+
+
+        # 正文信息 会有多页 情况
+        page_name: list = doc_root_info.get("page")
+        page_info_d = {}
+        if page_name:
+            for index, _page in enumerate(page_name):
+                page_xml_obj = self.get_xml_obj(_page)
+                # 重新获取页面size
+                try:
+                    page_size = [float(pos_i) for pos_i in
+                                     page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox",
+                                                                                              "").split(" ")
+                                     if re.match("[\d\.]", pos_i)]
+                    if page_size and len(page_size) >= 2:
+                        page_size_details.append(page_size)
+                    else:
+                        if doc_page_size:
+                            page_size_details.append(doc_page_size)
+                        else:
+                            page_size_details.append([])
+                except Exception as e:
+                    traceback.print_exc()
+                    page_size.append([])
+                page_info = ContentFileParser(page_xml_obj)()
+                pg_no = re.search(r"\d+", _page)
+                if pg_no:
+                    pg_no = int(pg_no.group())
+                else:
+                    pg_no = index
+                page_info_d[pg_no] = page_info
+                # 只跑一页
+                # print('odf_parser parser() 只跑一页')
+                # break
+
+        # 注释作为正文提取
+        annot_page_info_d = {}
+        annot_page_name: list = doc_root_info.get("annot_page")
+        if annot_page_name:
+            for index, _page in enumerate(annot_page_name):
+                annot_page_xml_obj = self.get_xml_obj(_page)
+                annot_page_info = ContentFileParser(annot_page_xml_obj)()
+                pg_no = re.search(r"\d+", _page)
+                if pg_no:
+                    pg_no = int(pg_no.group())
+                else:
+                    pg_no = index
+
+                # 重新获取页面size
+                # try:
+                #     page_size = [float(pos_i) for pos_i in
+                #                  annot_page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox",
+                #                                                                           "").split(" ")
+                #                  if re.match("[\d\.]", pos_i)]
+                #     if page_size and len(page_size) >= 2:
+                #         # page_size_details.append(page_size)
+                #         pass
+                #     else:
+                #         page_size = []
+                # except Exception as e:
+                #     traceback.print_exc()
+                #     page_size.append([])
+                page_size = self.get_page_size(annot_page_xml_obj)
+                # if not page_size:
+                #     page_size = doc_page_size
+
+                # annot_page_info['annot_page_size'] = page_size
+                annot_page_info_d[pg_no] = annot_page_info
+                # 只跑一页
+                # print('odf_parser parser() 只跑一页')
+                # break
+        # 注释文本信息合到正文信息中
+        for page_id, page_d in page_info_d.items():
+            if page_id not in annot_page_info_d.keys():
+                continue
+            annot_page_d = annot_page_info_d.get(page_id)
+            # print("annot_page_d.get('text_list')", annot_page_d.get('text_list'))
+            page_d['text_list'] += annot_page_d.get('text_list')
+            page_d['annot_text_list'] = annot_page_d.get('text_list')
+            # page_d['annot_page_size'] = annot_page_d.get('annot_page_size')
+        # print('page_info_d', page_info_d)
+        # print('annot_page_info_d', annot_page_info_d)
+
+        # 模板信息
+        tpls_name: list = doc_root_info.get("tpls")
+        # if tpls_name:
+        #     for index, _tpl in enumerate(tpls_name):
+        #         tpl_xml_obj = self.get_xml_obj(_tpl)
+        #         tpl_info = ContentFileParser(tpl_xml_obj)()
+        #         tpl_no = re.search(r"\d+", _tpl)
+        #
+        #         if tpl_no:
+        #             tpl_no = int(tpl_no.group())
+        #         else:
+        #             tpl_no = index
+        #
+        #         if tpl_no in page_info_d:
+        #             page_info_d[pg_no]["text_list"].extend(tpl_info["text_list"])
+        #             page_info_d[pg_no]["text_list"].sort(
+        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
+        #             page_info_d[pg_no]["img_list"].extend(tpl_info["img_list"])
+        #             page_info_d[pg_no]["img_list"].sort(
+        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
+        #             page_info_d[pg_no]["line_list"].extend(tpl_info["line_list"])
+        #             page_info_d[pg_no]["line_list"].sort(
+        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
+        #         else:
+        #             page_info_d[tpl_no] = tpl_info
+        #             page_info_d[tpl_no].sort(
+        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
+
+        # todo 读取注释信息
+        page_ID = 0  # 没遇到过doc多个的情况
+        # print("page_info",len(page_info))
+        doc_list.append({
+            "default_page_size": default_page_size,
+            "page_size": page_size_details,
+            "pdf_name": self.file_tree["pdf_name"],
+            "doc_no": page_ID,
+            "images": img_info,
+            "signatures_page_id": signatures_page_id,
+            "page_id_map": page_id_map,
+            "fonts": font_info,
+            "page_info": page_info_d,
+            "page_tpl_info": page_info_d,
+            "page_content_info": page_info_d,
+            # "annot_page_info": annot_page_info_d,
+        })
+        return doc_list
+
+    def get_page_size(self, page_xml_obj):
+        try:
+            page_size = [float(pos_i) for pos_i in page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox", "").split(" ")if re.match("[\d\.]", pos_i)]
+            if not (page_size and len(page_size) >= 2):
+                page_size = [float(pos_i) for pos_i in page_xml_obj.get('ofd:Document', {}).get('ofd:CommonData', {}).get("ofd:PageArea", {}).get("ofd:PhysicalBox", "").split(" ")if re.match("[\d\.]", pos_i)]
+                if not (page_size and len(page_size) >= 2):
+                    page_size = []
+        except Exception as e:
+            traceback.print_exc()
+            page_size = []
+        return page_size
+
    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        """
        Unpack the OFD archive and return the parsed document list.

        Keyword args:
            save_xml (bool): whether FileRead should dump the extracted XML.
            xml_name: file name used when dumping XML.
            save_dir: extraction directory; also used to locate image files.
        """
        save_xml = kwargs.get("save_xml", False)
        xml_name = kwargs.get("xml_name")
        save_dir = kwargs.get("save_dir")
        # Build the in-memory file tree (path -> parsed xml / base64 payload).
        self.file_tree = FileRead(self.ofdb64)(save_xml=save_xml, xml_name=xml_name, save_dir=save_dir)
        # logger.info(self.file_tree)
        return self.parser(save_dir)
+
+
if __name__ == "__main__":
    # Ad-hoc smoke test: parse a local OFD file and dump the result.
    sample = "C:/Users/Administrator/Downloads/1750060386706.ofd"
    with open(sample, "rb") as fh:
        ofdb64 = str(base64.b64encode(fh.read()), "utf-8")
    for obj in OFDParser(ofdb64)():
        print('obj', obj)

+ 31 - 0
format_convert/easyofd/easyofd/parser_ofd/parameter_parser.py

@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd
+# CREATE_TIME: 
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note:参数解析器
+from loguru import logger
+from typing import List, Dict, Any, Union, Tuple, Optional
+
+
class ParameterParser(object):
    """Type-checked accessor for OFD node attributes/children.

    `parameter` maps an OFD key to a pair:
    (accepted type(s) for the isinstance check, zero-arg factory producing
    the empty default when the stored value is missing or mistyped).
    """

    # NOTE(review): "ofd:Test" looks like a typo for "ofd:Text" — confirm
    # against the content-parser call sites before renaming.
    parameter = {
        "ofd:FillColor": (dict, dict),
        "ofd:StrokeColor": (dict, dict),
        "ofd:Test": ((str, int), str),
        "ofd:Font": (str, str),
        "@Value": (str, str)
    }

    def __call__(self, key, container):
        """Return container[key] if its type matches; else an empty default.

        Returns None (with a warning) for keys not declared in `parameter`.
        """
        if key in ParameterParser.parameter:
            v = container.get(key, None)
            t = ParameterParser.parameter[key]
            if isinstance(v, t[0]):
                return v
            else:
                # Wrong type or missing: hand back a fresh empty default.
                return t[1]()
        else:
            logger.warning(f"{key} not in ParameterParser")
            return None

+ 61 - 0
format_convert/easyofd/easyofd/parser_ofd/path_parser.py

@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  path_parser.py
+# CREATE_TIME: 2025/4/9 16:31
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:
+from enum import Enum
+import os
+
class PathType(Enum):
    """Classification of a file-system path used by PathParser."""
    absolutely = 1  # absolute path (os.path.isabs)
    relative = 2    # anything else, incl. "./", "../" and bare names
+
class PathParser:
    """
    Path parser: resolves OFD-internal resource locations to concrete paths.

    Handles the following shapes of `loc_path`:
        "/ROOT/a.xml"    (absolute)
        "./ROOT/a.xml"
        "../ROOT/a.xml"
        "ROOT/a.xml"
    Separators are normalised to the host OS convention.
    """

    def __init__(self, root_path:str):
        # Record the host path flavour once; format_path() branches on it.
        if os.name == 'nt':
            self.os = "nt"
        else:
            self.os = "posix"

        # NOTE(review): root_path is stored but never used in __call__ —
        # confirm whether absolute resolution should anchor on it.
        self.root_path = self.format_path(root_path)

    def format_path(self,path:str):
        """Normalise a path and force host-native separators."""
        normalized = os.path.normpath(path)
        if self.os == "nt":
            return normalized.replace("/","\\")
        else:
            return normalized.replace("\\","/")

    def get_path_type(self, path:str):
        """Classify `path` as absolute or relative."""
        if os.path.isabs(path):
            return PathType.absolutely
        else:
            return PathType.relative

    def __call__(self,cur_path:str,loc_path:str):
        """
        loc_path is posix style
        Resolve `loc_path` against `cur_path` and return the combined path.
        """
        path_type = self.get_path_type(loc_path)
        if path_type == PathType.absolutely:
            return self.format_path(loc_path)
        if path_type == PathType.relative:
            # NOTE(review): "./x" is joined onto cur_path itself, while a bare
            # "x" and "../x" are joined onto dirname(cur_path). If cur_path is
            # a file path these two conventions disagree — confirm which
            # anchoring the OFD locators expect.
            if loc_path.startswith("./"):
                path = os.path.join(cur_path, self.format_path(loc_path[2:]))
            elif loc_path.startswith("../"):
                path = os.path.join(os.path.dirname(cur_path), self.format_path(loc_path[3:]))
            else:
                path = os.path.join(os.path.dirname(cur_path), self.format_path(loc_path))
            return path
+ 7 - 0
format_convert/easyofd/easyofd/template_ofd/__init__.py

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  __init__.py.py
+# CREATE_TIME: 2025/3/28 15:43
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:

+ 53 - 0
format_convert/font_map/extend_to_normal_dict.txt

@@ -0,0 +1,53 @@
+{
+ "⺁":"厂",
+ "⺇":"几",
+ "⺌":"小",
+ "⺎":"兀",
+ "⺏":"尣",
+ "⺐":"尢",
+ "⺑":"𡯂",
+ "⺒":"巳",
+ "⺓":"幺",
+ "⺛":"旡",
+ "⺝":"月",
+ "⺟":"母",
+ "⺠":"民",
+ "⺱":"冈",
+ "⺸":"芈",
+ "⻁":"虎",
+ "⻄":"西",
+ "⻅":"见",
+ "⻆":"角",
+ "⻇":"𧢲",
+ "⻉":"贝",
+ "⻋":"车",
+ "⻒":"镸",
+ "⻓":"长",
+ "⻔":"门",
+ "⻗":"雨",
+ "⻘":"青",
+ "⻙":"韦",
+ "⻚":"页",
+ "⻛":"风",
+ "⻜":"飞",
+ "⻝":"食",
+ "⻡":"𩠐",
+ "⻢":"马",
+ "⻣":"骨",
+ "⻤":"鬼",
+ "⻥":"鱼",
+ "⻦":"鸟",
+ "⻧":"卤",
+ "⻨":"麦",
+ "⻩":"黄",
+ "⻬":"齐",
+ "⻮":"齿",
+ "⻯":"竜",
+ "⻰":"龙",
+ "⻳":"龟",
+ "⾅":"臼",
+ "⼝":"口",
+ "⼾":"户",
+ "⼉":"儿",
+ "⼱":"巾"
+}

+ 214 - 0
format_convert/font_map/kangxi_to_normal

@@ -0,0 +1,214 @@
+⼀ 2F00 一 4E00
+⼁ 2F01 丨 4E28
+⼂ 2F02 丶 4E36
+⼃ 2F03 丿 4E3F
+⼄ 2F04 乙 4E59
+⼅ 2F05 亅 4E85
+⼆ 2F06 二 4E8C
+⼇ 2F07 亠 4EA0
+⼈ 2F08 人 4EBA
+⼉ 2F09 儿 513F
+⼊ 2F0A 入 5165
+⼋ 2F0B 八 516B
+⼌ 2F0C 冂 5182
+⼍ 2F0D 冖 5196
+⼎ 2F0E 冫 51AB 
+⼏ 2F0F 几 51E0
+⼐ 2F10 凵 51F5
+⼑ 2F11 刀 5200
+⼒ 2F12 力 529B
+⼓ 2F13 勹 52F9
+⼔ 2F14 匕 5315 
+⼕ 2F15 匚 531A 
+⼖ 2F16 匸 5338 
+⼗ 2F17 十 5341
+⼘ 2F18 卜 535C
+⼙ 2F19 卩 5369
+⼚ 2F1A 厂 5382
+⼛ 2F1B 厶 53B6
+⼜ 2F1C 又 53C8
+⼝ 2F1D 口 53E3
+⼞ 2F1E 囗 56D7
+⼟ 2F1F 土 571F
+⼠ 2F20 士 58EB
+⼡ 2F21 夂 5902
+⼢ 2F22 夊 590A
+⼣ 2F23 夕 5915
+⼤ 2F24 大 5927
+⼥ 2F25 女 5973
+⼦ 2F26 子 5B50
+⼧ 2F27 宀 5B80
+⼨ 2F28 寸 5BF8
+⼩ 2F29 小 5C0F
+⼪ 2F2A 尢 5C22
+⼫ 2F2B 尸 5C38
+⼬ 2F2C 屮 5C6E
+⼭ 2F2D 山 5C71
+⼮ 2F2E 巛 5DDB
+⼯ 2F2F 工 5DE5
+⼰ 2F30 己 5DF1
+⼱ 2F31 巾 5DFE
+⼲ 2F32 干 5E72
+⼳ 2F33 幺 5E7A
+⼴ 2F34 广 5E7F
+⼵ 2F35 廴 5EF4
+⼶ 2F36 廾 5EFE
+⼷ 2F37 弋 5F0B
+⼸ 2F38 弓 5F13
+⼹ 2F39 彐 5F50
+⼺ 2F3A 彡 5F61
+⼻ 2F3B 彳 5F73
+⼼ 2F3C 心 5FC3
+⼽ 2F3D 戈 6208
+⼾ 2F3E 戶 6236
+⼿ 2F3F 手 624B
+⽀ 2F40 支 652F
+⽁ 2F41 攴 6534
+⽂ 2F42 文 6587
+⽃ 2F43 斗 6597
+⽄ 2F44 斤 65A4
+⽅ 2F45 方 65B9
+⽆ 2F46 无 65E0
+⽇ 2F47 日 65E5
+⽈ 2F48 曰 66F0
+⽉ 2F49 月 6708
+⽊ 2F4A 木 6728
+⽋ 2F4B 欠 6B20
+⽌ 2F4C 止 6B62
+⽍ 2F4D 歹 6B79
+⽎ 2F4E 殳 6BB3
+⽏ 2F4F 毋 6BCB
+⽐ 2F50 比 6BD4
+⽑ 2F51 毛 6BDB
+⽒ 2F52 氏 6C0F
+⽓ 2F53 气 6C14
+⽔ 2F54 水 6C34
+⽕ 2F55 火 706B
+⽖ 2F56 爪 722A
+⽗ 2F57 父 7236
+⽘ 2F58 爻 723B
+⽙ 2F59 爿 723F
+⽚ 2F5A 片 7247
+⽛ 2F5B 牙 7259
+⽜ 2F5C 牛 725B
+⽝ 2F5D 犬 72AC
+⽞ 2F5E 玄 7384
+⽟ 2F5F 玉 7389
+⽠ 2F60 瓜 74DC
+⽡ 2F61 瓦 74E6
+⽢ 2F62 甘 7518
+⽣ 2F63 生 751F
+⽤ 2F64 用 7528
+⽥ 2F65 田 7530
+⽦ 2F66 疋 758B
+⽧ 2F67 疒 7592
+⽨ 2F68 癶 7676
+⽩ 2F69 白 767D
+⽪ 2F6A 皮 76AE
+⽫ 2F6B 皿 76BF
+⽬ 2F6C 目 76EE
+⽭ 2F6D 矛 77DB
+⽮ 2F6E 矢 77E2
+⽯ 2F6F 石 77F3
+⽰ 2F70 示 793A
+⽱ 2F71 禸 79B8
+⽲ 2F72 禾 79BE
+⽳ 2F73 穴 7A74
+⽴ 2F74 立 7ACB
+⽵ 2F75 竹 7AF9
+⽶ 2F76 米 7C73
+⽷ 2F77 糸 7CF8
+⽸ 2F78 缶 7F36
+⽹ 2F79 网 7F51
+⽺ 2F7A 羊 7F8A
+⽻ 2F7B 羽 7FBD
+⽼ 2F7C 老 8001
+⽽ 2F7D 而 800C
+⽾ 2F7E 耒 8012
+⽿ 2F7F 耳 8033
+⾀ 2F80 聿 807F
+⾁ 2F81 肉 8089
+⾂ 2F82 臣 81E3
+⾃ 2F83 自 81EA
+⾄ 2F84 至 81F3
+⾅ 2F85 臼 81FC
+⾆ 2F86 舌 820C
+⾇ 2F87 舛 821B
+⾈ 2F88 舟 821F
+⾉ 2F89 艮 826E
+⾊ 2F8A 色 8272
+⾋ 2F8B 艸 8278
+⾌ 2F8C 虍 864D
+⾍ 2F8D 虫 866B
+⾎ 2F8E 血 8840
+⾏ 2F8F 行 884C
+⾐ 2F90 衣 8863
+⾑ 2F91 襾 897E
+⾒ 2F92 見 898B
+⾓ 2F93 角 89D2
+⾔ 2F94 言 8A00
+⾕ 2F95 谷 8C37
+⾖ 2F96 豆 8C46
+⾗ 2F97 豕 8C55
+⾘ 2F98 豸 8C78
+⾙ 2F99 貝 8C9D
+⾚ 2F9A 赤 8D64
+⾛ 2F9B 走 8D70
+⾜ 2F9C 足 8DB3
+⾝ 2F9D 身 8EAB
+⾞ 2F9E 車 8ECA
+⾟ 2F9F 辛 8F9B
+⾠ 2FA0 辰 8FB0
+⾡ 2FA1 辵 8FB5
+⾢ 2FA2 邑 9091
+⾣ 2FA3 酉 9149
+⾤ 2FA4 采 91C7
+⾥ 2FA5 里 91CC
+⾦ 2FA6 金 91D1
+⾧ 2FA7 長 9577
+⾨ 2FA8 門 9580
+⾩ 2FA9 阜 961C
+⾪ 2FAA 隶 96B6
+⾫ 2FAB 隹 96B9
+⾬ 2FAC 雨 96E8
+⾭ 2FAD 青 9752
+⾮ 2FAE 非 975E
+⾯ 2FAF 面 9762
+⾰ 2FB0 革 9769
+⾱ 2FB1 韋 97CB
+⾲ 2FB2 韭 97ED
+⾳ 2FB3 音 97F3
+⾴ 2FB4 頁 9801
+⾵ 2FB5 風 98A8
+⾶ 2FB6 飛 98DB
+⾷ 2FB7 食 98DF
+⾸ 2FB8 首 9996
+⾹ 2FB9 香 9999
+⾺ 2FBA 馬 99AC
+⾻ 2FBB 骨 9AA8
+⾼ 2FBC 高 9AD8
+⾽ 2FBD 髟 9ADF
+⾾ 2FBE 鬥 9B25
+⾿ 2FBF 鬯 9B2F
+⿀ 2FC0 鬲 9B32
+⿁ 2FC1 鬼 9B3C
+⿂ 2FC2 魚 9B5A
+⿃ 2FC3 鳥 9CE5
+⿄ 2FC4 鹵 9E75
+⿅ 2FC5 鹿 9E7F
+⿆ 2FC6 麥 9EA5
+⿇ 2FC7 麻 9EBB
+⿈ 2FC8 黃 9EC3
+⿉ 2FC9 黍 9ECD
+⿊ 2FCA 黑 9ED1
+⿋ 2FCB 黹 9EF9
+⿌ 2FCC 黽 9EFD
+⿍ 2FCD 鼎 9F0E
+⿎ 2FCE 鼓 9F13
+⿏ 2FCF 鼠 9F20
+⿐ 2FD0 鼻 9F3B
+⿑ 2FD1 齊 9F4A
+⿒ 2FD2 齒 9F52
+⿓ 2FD3 龍 9F8D
+⿔ 2FD4 龜 9F9C
+⿕ 2FD5 龠 9FA0

+ 154 - 0
format_convert/font_map/kangxi_to_normal_dict.txt

@@ -0,0 +1,154 @@
+{
+    "⼀": "一",
+    "⼄": "乙",
+    "⼆": "二",
+    "⼈": "人",
+    "⼉": "儿",
+    "⼊": "入",
+    "⼋": "八",
+    "⼏": "几",
+    "⼑": "刀",
+    "⼒": "力",
+    "⼔": "匕",
+    "⼗": "十",
+    "⼘": "卜",
+    "⼚": "厂",
+    "⼜": "又",
+    "⼝": "口",
+    "⼞": "口",
+    "⼟": "土",
+    "⼠": "士",
+    "⼤": "大",
+    "⼥": "女",
+    "⼦": "子",
+    "⼨": "寸",
+    "⼩": "小",
+    "⼫": "尸",
+    "⼭": "山",
+    "⼯": "工",
+    "⼰": "己",
+    "⼲": "干",
+    "⼴": "广",
+    "⼸": "弓",
+    "⼼": "心",
+    "⼽": "戈",
+    "⼿": "手",
+    "⽀": "支",
+    "⽂": "文",
+    "⽃": "斗",
+    "⽄": "斤",
+    "⽅": "方",
+    "⽆": "无",
+    "⽇": "日",
+    "⽈": "曰",
+    "⽉": "月",
+    "⽊": "木",
+    "⽋": "欠",
+    "⽌": "止",
+    "⽍": "歹",
+    "⽏": "毋",
+    "⽐": "比",
+    "⽑": "毛",
+    "⽒": "氏",
+    "⽓": "气",
+    "⽔": "水",
+    "⽕": "火",
+    "⽖": "爪",
+    "⽗": "父",
+    "⽚": "片",
+    "⽛": "牙",
+    "⽜": "牛",
+    "⽝": "犬",
+    "⽞": "玄",
+    "⽟": "玉",
+    "⽠": "瓜",
+    "⽡": "瓦",
+    "⽢": "甘",
+    "⽣": "生",
+    "⽤": "用",
+    "⽥": "田",
+    "⽩": "白",
+    "⽪": "皮",
+    "⽫": "皿",
+    "⽬": "目",
+    "⽭": "矛",
+    "⽮": "矢",
+    "⽯": "石",
+    "⽰": "示",
+    "⽲": "禾",
+    "⽳": "穴",
+    "⽴": "立",
+    "⽵": "竹",
+    "⽶": "米",
+    "⽸": "缶",
+    "⽹": "网",
+    "⽺": "羊",
+    "⽻": "羽",
+    "⽼": "老",
+    "⽽": "而",
+    "⽿": "耳",
+    "⾁": "肉",
+    "⾂": "臣",
+    "⾃": "自",
+    "⾄": "至",
+    "⾆": "舌",
+    "⾈": "舟",
+    "⾉": "艮",
+    "⾊": "色",
+    "⾍": "虫",
+    "⾎": "血",
+    "⾏": "行",
+    "⾐": "衣",
+    "⾒": "见",
+    "⾓": "角",
+    "⾔": "言",
+    "⾕": "谷",
+    "⾖": "豆",
+    "⾚": "赤",
+    "⾛": "走",
+    "⾜": "足",
+    "⾝": "身",
+    "⾞": "车",
+    "⾟": "辛",
+    "⾠": "辰",
+    "⾢": "邑",
+    "⾣": "酉",
+    "⾤": "采",
+    "⾥": "里",
+    "⾦": "金",
+    "⾧": "长",
+    "⾨": "门",
+    "⾩": "阜",
+    "⾪": "隶",
+    "⾬": "雨",
+    "⾭": "青",
+    "⾮": "非",
+    "⾯": "面",
+    "⾰": "革",
+    "⾲": "韭",
+    "⾳": "音",
+    "⾴": "页",
+    "⾵": "风",
+    "⾶": "飞",
+    "⾷": "食",
+    "⾸": "首",
+    "⾹": "香",
+    "⾺": "马",
+    "⾻": "骨",
+    "⾼": "高",
+    "⿁": "鬼",
+    "⿂": "鱼",
+    "⿃": "鸟",
+    "⿄": "卤",
+    "⿅": "鹿",
+    "⿇": "麻",
+    "⿉": "黍",
+    "⿊": "黑",
+    "⿍": "鼎",
+    "⿎": "鼓",
+    "⿏": "鼠",
+    "⿐": "鼻",
+    "⿒": "齿",
+    "⿓": "龙",
+    "⼣": "夕"
+}

+ 327 - 0
format_convert/ofd/ofd_parser.py

@@ -0,0 +1,327 @@
+import os
+import zipfile
+import xml.etree.ElementTree as ET
+from typing import Dict, List, Any, Optional
+from pathlib import Path
+
+
class OFDParser:
    """Parse an OFD (GB/T 33190) archive into a plain dict structure."""

    def __init__(self, ofd_path: str):
        """Validate the OFD file path; raises if missing or not a zip archive."""
        self.ofd_path = ofd_path
        # NOTE(review): fixed shared temp dir — two concurrent parses would
        # trample each other's extracted files; consider tempfile.mkdtemp().
        self.temp_dir = Path("./ofd_temp")
        self.ofd_info = {}
        self.documents = []

        if not os.path.exists(ofd_path):
            raise FileNotFoundError(f"OFD文件不存在: {ofd_path}")

        if not zipfile.is_zipfile(ofd_path):
            raise ValueError(f"文件不是有效的OFD文件(Zip格式): {ofd_path}")

    def parse(self) -> Dict[str, Any]:
        """Parse the OFD file and return {'file_info': ..., 'documents': [...]}."""
        try:
            self._extract_ofd()
            self._parse_ofd_xml()
            self._parse_documents()
            return {
                "file_info": self.ofd_info,
                "documents": self.documents
            }
        finally:
            self._cleanup()

    def _extract_ofd(self) -> None:
        """Unzip the OFD archive into the temp directory."""
        self.temp_dir.mkdir(exist_ok=True)
        with zipfile.ZipFile(self.ofd_path, 'r') as zip_ref:
            zip_ref.extractall(self.temp_dir)

    def _parse_ofd_xml(self) -> None:
        """Parse the top-level OFD.xml entry into self.ofd_info."""
        ofd_xml_path = self.temp_dir / "OFD.xml"
        if not ofd_xml_path.exists():
            raise ValueError("OFD.xml文件缺失")

        root = ET.parse(ofd_xml_path).getroot()
        namespace = {'ofd': 'http://www.ofdspec.org/2016'}

        # Document body: points at the actual Document.xml and signatures.
        doc_body = root.find('ofd:DocBody', namespace)
        if doc_body is not None:
            # Relative location of the main document file.
            doc_file = doc_body.find('ofd:DocFile', namespace)
            if doc_file is not None:
                self.ofd_info['doc_file'] = doc_file.text

            # Signature list reference.
            # NOTE(review): 'FileRef'/'Count' attributes — many OFD writers put
            # the signatures path in the element text (BaseLoc); confirm
            # against real files.
            signatures = doc_body.find('ofd:Signatures', namespace)
            if signatures is not None:
                self.ofd_info['signatures'] = {
                    'file': signatures.get('FileRef'),
                    'count': int(signatures.get('Count', 0))
                }

    def _parse_documents(self) -> None:
        """Find every Document.xml in the archive and parse each one."""
        doc_xml_files = list(self.temp_dir.rglob("Document.xml"))
        for doc_xml in doc_xml_files:
            doc_info = self._parse_document(doc_xml)
            self.documents.append(doc_info)

    def _parse_document(self, doc_xml_path: Path) -> Dict[str, Any]:
        """Parse one Document.xml: fonts, metadata, and all referenced pages."""
        namespace = {'ofd': 'http://www.ofdspec.org/2016'}
        root = ET.parse(doc_xml_path).getroot()

        document = {
            'path': str(doc_xml_path),
            'pages': [],
            'fonts': self._parse_fonts(root, namespace),
            'metadata': self._parse_metadata(root, namespace)
        }

        # Page references.
        # NOTE(review): child element 'ofd:PageFile' — the common OFD layout
        # stores the page path in the Page element's 'BaseLoc' attribute;
        # verify this matches the files this parser targets.
        pages_node = root.find('.//ofd:Pages', namespace)
        if pages_node is not None:
            page_references = pages_node.findall('ofd:Page', namespace)
            for page_ref in page_references:
                page_id = page_ref.get('ID')
                page_file = page_ref.find('ofd:PageFile', namespace)
                if page_file is not None:
                    # Path is resolved relative to the extraction root.
                    page_path = self.temp_dir / page_file.text
                    if page_path.exists():
                        page_info = self._parse_page(page_path)
                        document['pages'].append({
                            'id': page_id,
                            'content': page_info
                        })

        return document

    def _parse_fonts(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, str]]:
        """Collect font declarations (id/name/family/format and style flags)."""
        fonts = []
        font_list = root.find('.//ofd:Fonts', ns)
        if font_list is not None:
            for font_node in font_list.findall('ofd:Font', ns):
                font = {
                    'id': font_node.get('ID'),
                    'name': font_node.get('FontName'),
                    'family': font_node.get('FamilyName'),
                    'format': font_node.get('FontFormat'),
                    # OFD encodes booleans as the literal string 'true'.
                    'bold': font_node.get('Bold') == 'true',
                    'italic': font_node.get('Italic') == 'true',
                    'serif': font_node.get('Serif') == 'true',
                    'fixed_width': font_node.get('FixedWidth') == 'true'
                }
                fonts.append(font)
        return fonts

    def _parse_metadata(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, str]:
        """Extract DocInfo metadata fields (title, author, dates, ...)."""
        metadata = {}
        doc_info = root.find('.//ofd:DocInfo', ns)
        if doc_info is not None:
            for attr in ['Title', 'Author', 'Subject', 'Keywords', 'Creator',
                         'CreatorVersion', 'CreationDate', 'ModDate']:
                element = doc_info.find(f'ofd:{attr}', ns)
                if element is not None and element.text:
                    metadata[attr] = element.text
        return metadata

    def _parse_page(self, page_path: Path) -> Dict[str, Any]:
        """Parse one page XML: size, text, images, graphics and layers."""
        # All four prefixes deliberately resolve to the same OFD namespace URI,
        # so 'ofdtext:'/'ofdimg:'/'ofdgraph:' queries are equivalent to 'ofd:'.
        namespace = {
            'ofd': 'http://www.ofdspec.org/2016',
            'ofdtext': 'http://www.ofdspec.org/2016',
            'ofdgraph': 'http://www.ofdspec.org/2016',
            'ofdimg': 'http://www.ofdspec.org/2016'
        }
        root = ET.parse(page_path).getroot()

        page = {
            'size': self._parse_page_size(root, namespace),
            'text_content': self._extract_text_content(root, namespace),
            'images': self._extract_images(root, namespace),
            'graphics': self._extract_graphics(root, namespace),
            'layers': self._parse_layers(root, namespace)
        }

        return page

    def _parse_page_size(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, float]:
        """Read page dimensions; returns zeros when PhysicalBox is absent."""
        box = root.find('.//ofd:Area/ofd:PhysicalBox', ns)
        if box is not None:
            # NOTE(review): OFD usually stores PhysicalBox as element text
            # "x y w h" rather than Width/Height/x/y attributes; if so this
            # always yields the defaults — confirm against sample pages.
            return {
                'width': float(box.get('Width', 0)),
                'height': float(box.get('Height', 0)),
                'x': float(box.get('x', 0)),
                'y': float(box.get('y', 0))
            }
        return {'width': 0, 'height': 0, 'x': 0, 'y': 0}

    def _extract_text_content(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Extract text objects with bounding boxes, styles and glyph runs."""
        text_objects = root.findall('.//ofdtext:TextObject', ns)
        texts = []

        for text_obj in text_objects:
            # NOTE(review): text_obj.get('BoundaryBox') raises AttributeError
            # (None.split) when the attribute is missing; the spec attribute is
            # commonly named 'Boundary' — verify before trusting this path.
            text_info = {
                'id': text_obj.get('ID'),
                'bounding_box': {
                    'x': float(text_obj.get('BoundaryBox').split()[0]),
                    'y': float(text_obj.get('BoundaryBox').split()[1]),
                    'width': float(text_obj.get('BoundaryBox').split()[2]),
                    'height': float(text_obj.get('BoundaryBox').split()[3])
                },
                'transform': text_obj.get('CTM'),
                'content': []
            }

            # Optional style block.
            style = text_obj.find('ofdtext:TextStyle', ns)
            if style is not None:
                text_info['style'] = {
                    'font': style.get('Font'),
                    'size': float(style.get('Size', 0)),
                    'color': style.get('FillColor'),
                    'weight': style.get('Weight'),
                    'italic': style.get('Italic') == 'true',
                    'underline': style.get('Underline') == 'true',
                    'strikeout': style.get('StrikeOut') == 'true'
                }

            # Actual glyph runs with their start positions.
            text_codecs = text_obj.findall('ofdtext:TextCode', ns)
            for codec in text_codecs:
                if codec.text:
                    text_info['content'].append({
                        'text': codec.text.strip(),
                        'position': {
                            'x': float(codec.get('X', 0)),
                            'y': float(codec.get('Y', 0))
                        }
                    })

            # Skip text objects that produced no visible content.
            if text_info['content']:
                texts.append(text_info)

        return texts

    def _extract_images(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Extract image placements (bbox, resource id, transform)."""
        images = []
        image_objects = root.findall('.//ofdimg:ImageObject', ns)

        for img_obj in image_objects:
            # Same BoundaryBox caveat as in _extract_text_content.
            image = {
                'id': img_obj.get('ID'),
                'bounding_box': {
                    'x': float(img_obj.get('BoundaryBox').split()[0]),
                    'y': float(img_obj.get('BoundaryBox').split()[1]),
                    'width': float(img_obj.get('BoundaryBox').split()[2]),
                    'height': float(img_obj.get('BoundaryBox').split()[3])
                },
                'resource_id': img_obj.get('ResourceID'),
                'transform': img_obj.get('CTM')
            }
            images.append(image)

        return images

    def _extract_graphics(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Extract vector path objects (bbox, colors, line width, path data)."""
        graphics = []
        graphic_objects = root.findall('.//ofdgraph:PathObject', ns)

        for graphic_obj in graphic_objects:
            # Same BoundaryBox caveat as in _extract_text_content.
            graphic = {
                'id': graphic_obj.get('ID'),
                'bounding_box': {
                    'x': float(graphic_obj.get('BoundaryBox').split()[0]),
                    'y': float(graphic_obj.get('BoundaryBox').split()[1]),
                    'width': float(graphic_obj.get('BoundaryBox').split()[2]),
                    'height': float(graphic_obj.get('BoundaryBox').split()[3])
                },
                'fill_color': graphic_obj.get('FillColor'),
                'stroke_color': graphic_obj.get('StrokeColor'),
                'line_width': float(graphic_obj.get('LineWidth', 0)),
                'path_data': graphic_obj.find('ofdgraph:PathData', ns).text if graphic_obj.find('ofdgraph:PathData',
                                                                                                ns) is not None else ''
            }
            graphics.append(graphic)

        return graphics

    def _parse_layers(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Summarise page layers: type plus counts of text/image/path objects."""
        layers = []
        layer_nodes = root.findall('.//ofd:Layer', ns)

        for layer in layer_nodes:
            layer_info = {
                'type': layer.get('Type'),
                'objects': {
                    'text': len(layer.findall('.//ofdtext:TextObject', ns)),
                    'images': len(layer.findall('.//ofdimg:ImageObject', ns)),
                    'graphics': len(layer.findall('.//ofdgraph:PathObject', ns))
                }
            }
            layers.append(layer_info)

        return layers

    def _cleanup(self) -> None:
        """Remove extracted temp files.

        NOTE(review): the rmtree call is commented out, so the extracted
        ./ofd_temp directory is intentionally (?) left on disk — confirm
        whether this is a debugging leftover.
        """
        import shutil
        # if self.temp_dir.exists():
        #     shutil.rmtree(self.temp_dir)
+
+
# Usage example / manual smoke test for OFDParser.
if __name__ == "__main__":
    try:
        p = "C:/Users/Administrator/Downloads/1750060386706.ofd"
        parser = OFDParser(p)
        result = parser.parse()

        # Top-level file info (doc file location, signatures).
        print("文档信息:", result["file_info"])

        # Walk every parsed document and summarise it.
        for doc_idx, document in enumerate(result["documents"], 1):
            print(f"\n文档 {doc_idx}:")
            print(f"  字体数量: {len(document['fonts'])}")
            print(f"  页面数量: {len(document['pages'])}")

            # Document metadata, if any was present in DocInfo.
            if document['metadata']:
                print("  元数据:")
                for key, value in document['metadata'].items():
                    print(f"    {key}: {value}")

            # Per-page content summary.
            for page_idx, page in enumerate(document["pages"], 1):
                print(f"\n  页面 {page_idx}:")
                print(f"    尺寸: {page['content']['size']['width']} x {page['content']['size']['height']}")
                print(f"    文本元素: {len(page['content']['text_content'])}")
                print(f"    图像元素: {len(page['content']['images'])}")
                print(f"    图形元素: {len(page['content']['graphics'])}")
                print(f"    图层数量: {len(page['content']['layers'])}")

                # Preview up to the first 5 text elements, truncated to 50 chars.
                if page['content']['text_content']:
                    print("    前5行文本:")
                    for i, text_elem in enumerate(page['content']['text_content'][:5]):
                        text_lines = " ".join([t['text'] for t in text_elem['content']])
                        print(f"      {i + 1}. {text_lines[:50]}{'...' if len(text_lines) > 50 else ''}")

    except Exception as e:
        print(f"解析OFD文件时出错: {e}")

+ 320 - 12
format_convert/utils.py

@@ -9,13 +9,18 @@ import pickle
 import socket
 import subprocess
 import sys
+from glob import glob
 from io import BytesIO
 from subprocess import Popen
+import pynvml
+import datetime
+import PyPDF2
 from shapely.geometry import LineString
 import cv2
 import requests
 from PIL import Image
-
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.ttfonts import TTFont
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 import difflib
 import logging
@@ -43,6 +48,14 @@ from shapely.geometry import Polygon
 
 config_file_path = os.path.dirname(os.path.abspath(__file__)) + "/../config/interface_new.yml"
 
+# 特殊中文转基本中文
+with open(os.path.abspath(os.path.dirname(__file__)) + '/font_map/extend_to_normal_dict.txt', 'r', encoding='utf-8') as f:
+    extend_to_normal_dict = f.read()
+    extend_to_normal_dict = eval(extend_to_normal_dict)
+with open(os.path.abspath(os.path.dirname(__file__)) + '/font_map/kangxi_to_normal_dict.txt', 'r', encoding='utf-8') as f:
+    kangxi_to_normal_dict = f.read()
+    kangxi_to_normal_dict = eval(kangxi_to_normal_dict)
+
 
 def has_intersection(poly1, poly2):
     """
@@ -62,7 +75,7 @@ def has_intersection(poly1, poly2):
 
 
 def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13,
-                                  -14, -15, -16, -17, -18, -19, -20, -21, -22]):
+                                  -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]):
     """
     [0] : continue
     [-1]: 逻辑处理错误
@@ -87,6 +100,7 @@ def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -1
     [-20]: requests请求超时
     [-21]: requests请求返回错误状态码
     [-22]: requests请求拒绝连接
+    [-23]: 两列无边框表格提取报错
     """
     for c in code:
         if isinstance(_list, list) and _list == [c]:
@@ -366,11 +380,45 @@ def slash_replace(_str, reverse=False):
     return _str
 
 
def align_table_lines(line_list, threshold=7):
    """
    Align nearly-collinear table rulings (including lines crossing merged
    cells) so that table reconstruction does not fail on slightly offset
    lines.

    Lines whose horizontal extent dominates are treated as rows, the rest
    as columns. Within each group, consecutive lines closer than
    `threshold` are snapped together by mutating the EARLIER line's bbox.
    Returns rows followed by columns (order differs from the input).
    """
    horizontals = []
    verticals = []
    for ln in line_list:
        x0, y0, x1, y1 = ln.bbox
        target = horizontals if abs(x0 - x1) > abs(y0 - y1) else verticals
        target.append(ln)

    # Nothing to align unless both orientations are present.
    if not horizontals or not verticals:
        return line_list

    # Snap near-equal row y-coordinates onto the following line's values.
    horizontals.sort(key=lambda ln: (ln.bbox[1], ln.bbox[0]))
    prev = horizontals[0]
    for cur in horizontals[1:]:
        y_gap = abs(cur.bbox[1] - prev.bbox[1])
        if y_gap <= threshold and cur.bbox[1] != prev.bbox[1]:
            prev.bbox = (prev.bbox[0], cur.bbox[1], prev.bbox[2], cur.bbox[3])
        prev = cur

    # Snap near-equal column x-coordinates onto the following line's values.
    verticals.sort(key=lambda ln: (ln.bbox[0], ln.bbox[1]))
    prev = verticals[0]
    for cur in verticals[1:]:
        x_gap = abs(cur.bbox[0] - prev.bbox[0])
        if x_gap <= threshold and cur.bbox[0] != prev.bbox[0]:
            prev.bbox = (cur.bbox[0], prev.bbox[1], cur.bbox[2], prev.bbox[3])
        prev = cur

    return horizontals + verticals
+
+
 class LineTable:
     def recognize_table(self, list_textbox, list_line, sourceP_LB=False,
                         splited=False, from_pdf=False, is_reverse=False, show=0):
         self.list_line = list_line
-        self.list_crosspoints = self.recognize_crosspoints(list_line)
         self.from_pdf = from_pdf
         self.splited = splited
         self.connect_bbox_list = []
@@ -381,6 +429,13 @@ class LineTable:
             # 展示原始表格及文字
             self._plot(list_line, list_textbox, title='list_line,list_textbox')
 
+        list_line = align_table_lines(list_line)
+        if self.show:
+            self._plot(list_line, list_textbox, title='align_table_lines')
+
+        # 获取交点
+        self.list_crosspoints = self.recognize_crosspoints(list_line)
+
         # 聚类
         cluster_crosspoints = []
         for _point in self.list_crosspoints:
@@ -1189,6 +1244,15 @@ class LineTable:
 
     def fix_rect(self, _table, list_x, list_y, sourceP_LB, margin):
         self.fix_span(_table, list_x, list_y, sourceP_LB)
+        if self.show:
+            # 打印_table
+            temp_list = []
+            for t in _table:
+                print('------ fix_span row ------')
+                for c in t:
+                    print('fix_span col', c)
+                    temp_list.append(c)
+            self._plot([], [], temp_list, title='fix_span table')
 
         for _line in _table:
             _line.sort(key=lambda x: x.get('bbox')[0])
@@ -1646,7 +1710,7 @@ def sort_object(obj_list, is_reverse=False):
     if len(obj_list) == 0:
         return obj_list
     if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
-        obj_list.sort(key=lambda x: (x.y, x.x), reverse=is_reverse)
+        obj_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]), reverse=is_reverse)
         return obj_list
     elif isinstance(obj_list[0], _Page):
         obj_list.sort(key=lambda x: x.page_no)
@@ -2544,6 +2608,237 @@ def dynamic_get_port(start_port, mode='-1', num=10):
     return None
 
 
def text_bbox_to_lt(text_list, bbox_list):
    """Convert parallel OCR text / 4-point-bbox lists into TextBox objects.

    Each bbox is a quadrilateral; its first point (top-left) and third point
    (bottom-right) become the TextBox rectangle. Raises IndexError when
    text_list is shorter than bbox_list, matching the index-based pairing.
    """
    from format_convert.convert_tree import TextBox
    boxes = []
    for idx, quad in enumerate(bbox_list):
        rect = [quad[0][0], quad[0][1], quad[2][0], quad[2][1]]
        boxes.append(TextBox(rect, text_list[idx]))
    return boxes
+
+
def extract_one_page_pdf(input_pdf_path, output_pdf_path, page_no):
    """Copy a single page (0-based `page_no`) of a PDF into a new file.

    Errors are reported via print() and swallowed; nothing is returned.

    NOTE(review): this mixes the legacy PyPDF2 API (PdfFileReader/addPage)
    with the modern `.pages` property — it only works on PyPDF2 versions
    where both coexist (~2.x); on 1.x `.pages` may be missing and on 3.x
    PdfFileReader raises. Consider migrating to PdfReader/PdfWriter — TODO
    confirm the pinned PyPDF2 version.
    """
    try:
        # Open the source PDF.
        with open(input_pdf_path, 'rb') as input_file:
            pdf_reader = PyPDF2.PdfFileReader(input_file)

            # Validate the requested page number.
            if page_no < 0 or page_no >= len(pdf_reader.pages):
                print("页码超出范围")
                return

            # Writer for the single-page output.
            pdf_writer = PyPDF2.PdfFileWriter()

            # Copy the requested page across.
            pdf_writer.addPage(pdf_reader.pages[page_no])

            # Persist the new single-page PDF.
            with open(output_pdf_path, 'wb') as output_file:
                pdf_writer.write(output_file)

        print(f"成功提取第 {page_no + 1} 页并保存为 {output_pdf_path}")
    except Exception as e:
        print(f"提取页面失败:{e}")
+
+
def get_gpu_memory_usage():
    """Print total/used/free memory and per-process usage for every GPU.

    Best-effort monitoring helper: all failures are printed and swallowed.
    Relies on `pynvml` and `psutil` (assumed imported at module top — TODO
    confirm `psutil` is in this module's imports).
    """
    try:
        # Initialise NVML.
        pynvml.nvmlInit()
        # Number of GPU devices.
        device_count = pynvml.nvmlDeviceGetCount()
        # Timestamp for the report.
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # Walk every GPU.
        for i in range(device_count):
            # Device handle.
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)

            # Device name.
            # NOTE(review): .decode() below assumes bytes; recent pynvml
            # returns str and would raise AttributeError — confirm version.
            gpu_name = pynvml.nvmlDeviceGetName(handle)

            # Memory counters, converted to MiB.
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            total_memory = mem_info.total / (1024 * 1024)  # MiB
            used_memory = mem_info.used / (1024 * 1024)   # MiB
            free_memory = mem_info.free / (1024 * 1024)   # MiB

            info = f'  时间:{now}\n'
            info += f"  GPU信息 {i}: {gpu_name.decode('utf-8')}\n"
            info += f"    总显存: {total_memory:.2f} MiB\n"
            info += f"    已用显存: {used_memory:.2f} MiB\n"
            info += f"    剩余显存: {free_memory:.2f} MiB\n\n"

            # Per-process usage on this GPU.
            # NOTE: `used_memory` is intentionally reused below for the
            # per-process figure, shadowing the device-level value above.
            processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            if processes:
                info += f"  GPU进程信息: {i}\n"
                for p in processes:
                    pid = p.pid
                    used_memory = p.usedGpuMemory / (1024 * 1024)
                    try:
                        # Resolve the process command line via psutil.
                        proc = psutil.Process(pid)
                        cmdline = proc.cmdline()
                        # NOTE(review): the [-17:-14] slice extracts a 3-char
                        # tag near the end of the command line (looks tied to
                        # names like 'ocr_interface:app') — confirm intent.
                        info += f"    {' '.join(cmdline)[-17:-14]} {pid}: {used_memory:.2f} MiB\n"
                    except:
                        traceback.print_exc()
            print(info)

        # Shut down NVML.
        pynvml.nvmlShutdown()
    except:
        traceback.print_exc()
        pass
+
+
def get_current_process_gpu_id():
    """Return the index of the GPU the current process is running on.

    Scans every NVML device's compute-process list for os.getpid().
    Returns the 0-based GPU index, or None when the process is not found
    on any GPU or NVML is unavailable. Best-effort: errors are printed
    and swallowed.
    """
    initialized = False
    try:
        # Initialise NVML; remember success so cleanup is conditional.
        pynvml.nvmlInit()
        initialized = True

        # PID of the current process.
        current_pid = os.getpid()
        # print(f"Current PID: {current_pid}")

        # Number of GPU devices.
        device_count = pynvml.nvmlDeviceGetCount()

        # Walk every GPU device.
        for i in range(device_count):
            # Device handle.
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)

            # Processes currently running compute work on this GPU.
            try:
                processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            except pynvml.NVMLError:
                processes = []

            # Look for the current process.
            for p in processes:
                if p.pid == current_pid:
                    print(f"Process {current_pid} is running on GPU {i}")
                    return i

        print("Current process not found on any GPU")
        return None
    except:
        traceback.print_exc()
        return None
    finally:
        # Bug fix: the original unconditionally called nvmlShutdown() here.
        # When nvmlInit() failed, that raised NVMLError_Uninitialized from
        # inside `finally`, overriding the `return None` and propagating an
        # exception to the caller. Only shut down after a successful init,
        # and never let shutdown errors escape.
        if initialized:
            try:
                pynvml.nvmlShutdown()
            except pynvml.NVMLError:
                pass
+
+
def register_all_fonts(font_dir):
    """Recursively register every .ttf/.otf font under `font_dir` with
    reportlab's pdfmetrics, using the file stem as the font name.

    Registration failures are printed and skipped so one bad font does not
    abort the rest.
    """
    # Walk the font directory tree.
    for root, dirs, files in os.walk(font_dir):
        for file in files:
            # Only TrueType / OpenType files.
            if file.endswith((".ttf", ".otf")):
                font_path = os.path.join(root, file)
                # Font name = file name without extension.
                font_name = os.path.splitext(file)[0]
                try:
                    # Register with reportlab.
                    pdfmetrics.registerFont(TTFont(font_name, font_path))
                    print(f"Font registered: {font_name}")
                except Exception as e:
                    print(f"Failed to register font {font_name}: {e}")
+
+
def ascii85_decode(data):
    """Decode ASCII85 (base85) encoded bytes.

    Accepts the Adobe alphabet '!'..'u' plus the 'z' shorthand for four
    zero bytes; any other byte (whitespace, stray delimiters) is skipped,
    matching the original lenient behaviour.

    Bug fix vs. the previous version: a trailing group of fewer than five
    characters is now padded with 'u' (84) and the decoded 4-byte word
    truncated to n-1 bytes, per the ASCII85 specification. The old code
    decoded the short group as-is and emitted all four bytes, producing
    garbage at the end of the output. Invalid bytes are also skipped
    BEFORE grouping, so groups can no longer be silently shortened.

    :param data: encoded input as bytes/bytearray.
    :return: decoded bytes.
    """
    decoded = bytearray()
    group = []  # pending digit values (0..84) of the current 5-char group

    for c in data:
        if c == ord('z') and not group:
            # 'z' abbreviates a full group of four zero bytes.
            decoded += b'\0\0\0\0'
            continue
        if not (ord('!') <= c <= ord('u')):
            # Skip whitespace and any other out-of-alphabet byte.
            continue
        group.append(c - ord('!'))
        if len(group) == 5:
            value = 0
            for digit in group:
                value = value * 85 + digit
            decoded += value.to_bytes(4, byteorder='big')
            group = []

    if group:
        # Final partial group: pad with 'u' (max digit) and keep n-1 bytes.
        n = len(group)
        group += [84] * (5 - n)
        value = 0
        for digit in group:
            value = value * 85 + digit
        decoded += value.to_bytes(4, byteorder='big')[:n - 1]

    return bytes(decoded)
+
+
def special_font_to_normal(text):
    """Map special CJK code points (CJK Radicals Supplement / Kangxi
    Radicals) in `text` to their everyday Unicode equivalents.

    Lookup order matches the original: the extended-radical table first,
    then the Kangxi table; unmapped characters pass through unchanged.

    Improvement: the previous version rebuilt two `set`s of dictionary
    keys on every call before doing the same membership tests — dict
    membership is already O(1), so the sets were pure overhead on this
    per-text hot path.

    Depends on the module-level `extend_to_normal_dict` and
    `kangxi_to_normal_dict` loaded from format_convert/font_map/.
    """
    chars = []
    for c in text:
        if c in extend_to_normal_dict:
            chars.append(extend_to_normal_dict[c])
        elif c in kangxi_to_normal_dict:
            chars.append(kangxi_to_normal_dict[c])
        else:
            chars.append(c)
    return ''.join(chars)
+
+
def image_resize_by_ratio(img, max_width=1800, max_height=2600):
    """Shrink a PIL image to fit within max_width x max_height while
    preserving its aspect ratio.

    Images already within bounds are returned unchanged. When both sides
    exceed their limits, scaling follows whichever side overshoots by the
    larger ratio. Downscaling uses LANCZOS resampling.
    """
    width, height = img.size
    aspect_ratio = width / height

    if width > max_width and height > max_height:
        # Both dimensions overflow: constrain the one that overflows more.
        if width / max_width > height / max_height:
            new_width = max_width
            new_height = int(new_width / aspect_ratio)
        else:
            new_height = max_height
            new_width = int(new_height * aspect_ratio)
    elif width > max_width:
        # Only the width overflows.
        new_width = max_width
        new_height = int(new_width / aspect_ratio)
    elif height > max_height:
        # Only the height overflows.
        new_height = max_height
        new_width = int(new_height * aspect_ratio)
    else:
        # Already within bounds.
        new_width, new_height = width, height

    if (new_width, new_height) != (width, height):
        img = img.resize((new_width, new_height), Image.LANCZOS)
    return img
+
+
 if __name__ == "__main__":
     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
     # print(slash_replace(strs))
@@ -2572,14 +2867,27 @@ if __name__ == "__main__":
 
     # print(parse_yaml())
 
-    print(get_ip_port())
+    # print(get_ip_port())
     # set_flask_global()
-    print(get_all_ip())
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "ocr"))
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], 'convert', 'MASTER'))
+    # print(get_all_ip())
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "ocr"))
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], 'convert', 'MASTER'))
     # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
     # print(get_intranet_ip())
-    # _path = "C:/Users/Administrator/Downloads/3.png"
-    # remove_red_seal(cv2.imread(_path))
+
+    # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_pdf\*.pdf')
+    # save_dir = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf'
+    # index = 0
+    # for p in ps:
+    #     save_path = f'{save_dir}/e-{index}.pdf'
+    #     page_no = int(re.split('\.|-', p)[1])
+    #     extract_one_page_pdf(p, save_path, page_no)
+    #     index += 1
+
+    # _ss = 'otr_interface:app'
+    # print(_ss[-17:-14])
+
+    _ss = '仁和坪镇杨柳池村⼈居环境整治项⽬终⽌'
+    print(special_font_to_normal(_ss))

+ 3 - 0
monitor/watch_10_minutes_process.sh

@@ -1,3 +1,6 @@
 #!/bin/bash
 
 sed -n '/2024-05-29 17:30:00/,/2024-05-29 17:40:00/p' /convert.out | grep 'is_success' | wc -l
+
+
+sed -n '/2025-06-11 12:50:00/,/2025-06-11 13:00:00/p' /convert.out | grep 'is_success: ' | awk -F '[\\[\\] ]+' '{file_type=$(NF-2); time=$NF; map[file_type] += time; count[file_type]++} END {for (key in map) print key, "-", map[key], "-", count[key], "-", map[key]/count[key]}'

+ 8 - 26
ocr/ocr_interface.py

@@ -5,7 +5,7 @@ import multiprocessing as mp
 import socket
 import sys
 import os
-
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
 from PIL import Image
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
@@ -91,7 +91,10 @@ def picture2text(img_data, ocr_model, only_rec=0):
         text_list = []
         bbox_list = []
         if only_rec:
-            text_list = [results[0][0]]
+            if results:
+                text_list = [results[0][0]]
+            else:
+                text_list = []
             bbox_list = []
         else:
             for line in results:
@@ -176,27 +179,6 @@ def test_ocr_model(from_remote=True):
 
 
 if __name__ == '__main__':
-    test_ocr_model()
-
-#     src = """
-# data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASwAAAAeCAYAAACWuCNnAAAE3ElEQVR42u2dQWjUQBSGi4gnEURERKQgIiIiggdP4sWDiPTgvSAK4sFDEe8iIohHEW8iUsSLFBERQaSI9CBIERGRgpQiHtpuMkl2Pa/vH2fW2ZhssttkO8v+AyHTzCT53nT23zdvZpOJIAh2TEhqNpu7VaS+THiWWq3WHuFaQV7279bW1rarWP2y5eRnYv8ZI36l1PF2u71N52P11V7Ap+QaKbwPGo3GPudv8jOx/4wLv5xw2+ajKDrhY4cT5Z23xkKJhfOG5KeSJDlEfib2H/L7Y2ysHotRp62xsp8J4/Bc2ljyM7H/kN+LFEbhGxipjU3C86LYiRy7R34m9h/ye2s0DJZtEsaK0aujFO8ZdX72H/Kz/zup3W5vsfmgGRyJ4/hkug7cSBnn3nHrkZ+J/Yf8/8agkZoXRfyOPKYoaxnnRuqHbMti5EH9N1Q4DE91KXQYHnWNJf+IxJDY/uQfFj8qQzn1tCSglLpfm8v4d2y7omJ1E4bLvWa7WKQRBjGW/JsrVmx/8lfOf/HSlbatZPO4gVTeKhUjZxx6FdOR6XPSKa8s77hc8wIM1flYfcD4VrYW6kOpJf9etre97plxzZ78lX4wc/jlG+WZy9/nNYfGX4tYsf3JXwP/f2KCfd4GY+H6bUSwsrb13+t7jaLOWaMxBkZZEAT7scfisrKChelSaaxvmE6F2ouhd8M4vObyV5ny+PHNAH6U28VxPvLX0IHZ/uSvhb8jIlZQ5GbTUvmyzkdqCSeZ/CrGoQBBsAzHAFRWsNz7ZNWT63+Se10P4uCsLbOBO5ev0FjDb66p+c0Yu4sf5Xn8Aw5/uvjtP63v62wSf2XtwPYnf438mQJjhOsJ9nLya5xkXb8sDynPe8oSpqxyd02G68FJfjHPKytovDnwgzvNL2WfcQ/TGAuVjN8z+E1+MUmSXQP884fKX0Psiu1P/lr482NYcpJUPuMMB/VJ5keKjwYZEmYNPdPlcAVdYXLXavQVwzL8thHT/HV9WMHf+W3UBtaabBZ/laLF9id/1fyd9RO9YlcG5LVZYj/Vy9MZ9LgrZJi+xB7rNoqE0F3/Iar8Aq5pl1IbxXf5G0njcI3xmwU7/Zrmz0q+8feb2P7kHyY/Mk/hHloxgAvnBshc7wqPh8BUY1pgsrynokB7nujBjRSmnWUmAYwhmh8zEln82ugUf82ehebv4xvJK/4BvlHZ/uQfGn/noIlbvbRDMkwpOrGkn0Y9Z/MC4Hkxq6LydF3X4ys71HRmJzQ/HlNh+U15h1/KDwzL4+jH/fWFf1D3ne1P/mHw60pZImIj9WYt1LH0MvpeolM09CsTQC8rWJZfjProuKVTll8HBTP4vfmwk5/85C/PLwUPpcItu/zdDAFnigLcvQRlIx5WmetjOhZKnMWvV9wKv88BafKTn/wb4O9aHBqraayhQMCsKLheVmjq8LDMbMcS8piJ0NO1WBjn8Lvupm+J/OQn/wD8GC+aRz9Muk/8w0mIzPuq0pia1UaTn/zkHx9+85D45bxAmGdGzpOf/OQfY34peI7f+kDpfDQWv0XSY1tRYROQe0V+8pN/TPn5miPyk5/8I8PP1xyRn/zkHxl+viaI/OQnP/mrMpavOSI/+ck/SomvOSI/+ck/cqLF1xyRn/zk9yLxNUfkJz/5i9If9M5atZCy5xcAAAAASUVORK5CYII=
-# """
-#
-#     image_data = src.split('data:image/png;base64,')[1]
-#
-#     # 解码 base64 字符串
-#     image_bytes = base64.b64decode(image_data)
-#
-#     # 将字节转换为图像
-#     # image = Image.open(io.BytesIO(image_bytes))
-#
-#     # image.show('img')
-#
-#     # with open(r'C:\Users\Administrator\Desktop\test_image\error16.jpg', 'rb') as f:
-#     #     image_bytes = f.read()
-#
-#     image = bytes2np(image_bytes)
-#
-#     cv2.imshow('img', image)
-#     cv2.imwrite('./1.png', image)
-#     cv2.waitKey(0)
+    # test_ocr_model()
+
+    app.run(host='127.0.0.1', port=17000, debug=False)

+ 6 - 2
ocr/ppocr/data/__init__.py

@@ -25,6 +25,9 @@ import signal
 import random
 
 __dir__ = os.path.dirname(os.path.abspath(__file__))
+
+from format_convert.utils import get_platform
+
 sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
 
 import copy
@@ -49,8 +52,9 @@ def term_mp(sig_num, frame):
     os.killpg(pgid, signal.SIGKILL)
 
 
-signal.signal(signal.SIGINT, term_mp)
-signal.signal(signal.SIGTERM, term_mp)
+if get_platform() != 'Windows':
+    signal.signal(signal.SIGINT, term_mp)
+    signal.signal(signal.SIGTERM, term_mp)
 
 
 def build_dataloader(config, mode, device, logger, seed=None):

+ 39 - 0
ocr/test_lock.py

@@ -0,0 +1,39 @@
+import multiprocessing
+import os
+import sys
+import time
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + '/../')
+from format_convert.utils import file_lock
+
+
+def run(a):
+    while True:
+        try:
+            time2 = time.time()
+            lock_file_sub = 'ocr'
+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
+            f = file_lock(lock_file)
+            print(os.getpid(),"get file_lock " + lock_file + " time ", time.time()-time2)
+            time2 = time.time()
+            time.sleep(2)
+            raise
+            print(os.getpid(), "sleep", time.time()-time2)
+
+
+        except Exception:
+            print('RuntimeError')
+        finally:
+            f.close()
+
+
+if __name__ == '__main__':
+    # 要处理的数据
+    data = [1, 2, 3]
+
+    # 创建进程池,指定进程数为 CPU 核心数
+    with multiprocessing.Pool(processes=3) as pool:
+        # 使用 map 方法分配任务并获取结果
+        results = pool.map(run, data)
+
+    # 输出结果
+    # print(results)

+ 36 - 52
ocr/tools/infer/predict_det_pytorch.py

@@ -19,7 +19,8 @@ import sys
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../")
 import requests
 from format_convert import _global
-from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock
+from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock, \
+    get_gpu_memory_usage, get_current_process_gpu_id
 
 os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
 import cv2
@@ -120,6 +121,11 @@ class TextDetector(object):
         self.predictor.to(self.device)
         self.predictor.eval()
 
+        if str(self.device) != 'cpu':
+            self.gpu_id = get_current_process_gpu_id()
+        else:
+            self.gpu_id = None
+
         # self.predictor, self.input_tensor, self.output_tensors = utility.create_predictor(
         #     args, 'det', logger)  # paddle.jit.load(args.det_model_dir)
         # self.predictor.eval()
@@ -189,55 +195,44 @@ class TextDetector(object):
         shape_list = np.expand_dims(shape_list, axis=0)
         img = img.copy()
         starttime = time.time()
-
+        tensor = torch.from_numpy(img).float()
         # self.input_tensor.copy_from_cpu(img)
-        img = torch.from_numpy(img).float()
-        img = img.to(self.device)
-        try:
+        # if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not MAX_COMPUTE:
+        if get_platform() != "Windows" and not MAX_COMPUTE and self.gpu_id is not None:
             # 加锁,防止太多大图片同时预测,爆显存
-            if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not MAX_COMPUTE:
+            time2 = time.time()
+            lock_file_sub = f'ocr_{self.gpu_id}'
+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
+            f = file_lock(lock_file)
+            log("det get file_lock " + lock_file + " time " + str(time.time()-time2))
+
+            try:
                 time2 = time.time()
-                lock_file_sub = 'ocr'
-                lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
-                f = file_lock(lock_file)
-                log("get file_lock " + lock_file_sub + " time " + str(time.time()-time2))
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                tensor = tensor.to(self.device)
                 with torch.no_grad():
-                    out = self.predictor(img)
+                    out = self.predictor(tensor)
+                log("get file_lock run det" + " time " + str(time.time()-time2))
+            except RuntimeError:
+                log("ocr/tools/infer/predict_det.py predict.run error! maybe no gpu memory!")
+                log("det predictor shrink memory! ori_im.shape " + str(ori_im.shape))
+                get_gpu_memory_usage()
+                raise RuntimeError
+            finally:
                 f.close()
-            else:
-                with torch.no_grad():
-                    out = self.predictor(img)
-        except RuntimeError:
-            log("ocr/tools/infer/predict_det.py predict.run error! maybe no gpu memory!")
-            log("predictor shrink memory!")
-            # self.predictor.clear_intermediate_tensor()
-            # self.predictor.try_shrink_memory()
-            if str(self.device)!='cpu':
-                torch.cuda.empty_cache()
-                gc.collect()
-            raise RuntimeError
-
-        # outputs = []
-        # for output_tensor in self.output_tensors:
-        #     output = output_tensor.copy_to_cpu()
-        #     outputs.append(output)
-        out = out.cpu().numpy()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                # gc.collect()
+        else:
+            tensor = tensor.to(self.device)
+            with torch.no_grad():
+                out = self.predictor(tensor)
 
+        out = out.cpu().numpy()
         preds = {}
         preds['maps'] = out
 
-        # if self.det_algorithm == "EAST":
-        #     preds['f_geo'] = outputs[0]
-        #     preds['f_score'] = outputs[1]
-        # elif self.det_algorithm == 'SAST':
-        #     preds['f_border'] = outputs[0]
-        #     preds['f_score'] = outputs[1]
-        #     preds['f_tco'] = outputs[2]
-        #     preds['f_tvo'] = outputs[3]
-        # elif self.det_algorithm == 'DB':
-        #     preds['maps'] = outputs[0]
-        # else:
-        #     raise NotImplementedError
         post_result = self.postprocess_op(preds, shape_list)
         dt_boxes = post_result[0]['points']
         if self.det_algorithm == "SAST" and self.det_sast_polygon:
@@ -246,17 +241,6 @@ class TextDetector(object):
             dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
         elapse = time.time() - starttime
 
-        # 释放内存
-        # print("TextDetector", self.predictor)
-        # if TextDetector.shrink_memory_count % 100 == 0:
-            # print("TextDetector shrink memory")
-        # self.predictor.clear_intermediate_tensor()
-        # self.predictor.try_shrink_memory()
-        # TextDetector.shrink_memory_count += 1
-        if str(self.device) != 'cpu':
-            torch.cuda.empty_cache()
-            # gc.collect()
-
         return dt_boxes, elapse
 
 

+ 188 - 173
ocr/tools/infer/predict_rec_pytorch.py

@@ -37,8 +37,9 @@ import ocr.tools.infer.utility as utility
 from ocr.ppocr.postprocess import build_post_process
 from ocr.ppocr.utils.logging import get_logger
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
-
-from format_convert.utils import judge_error_code, log, namespace_to_dict,get_platform
+from config.max_compute_config import MAX_COMPUTE
+from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock, \
+    get_gpu_memory_usage, get_current_process_gpu_id
 from format_convert import _global
 
 import torch
@@ -56,6 +57,8 @@ class TextRecognizer(object):
         self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
         self.character_type = args.rec_char_type
         self.rec_batch_num = args.rec_batch_num
+        self.rec_batch_num = 16
+        print('self.rec_batch_num', self.rec_batch_num)
         self.rec_algorithm = args.rec_algorithm
         postprocess_params = {
             'name': 'CTCLabelDecode',
@@ -64,23 +67,7 @@ class TextRecognizer(object):
             # "use_space_char": args.use_space_char
             "use_space_char": False
         }
-        # if self.rec_algorithm == "SRN":
-        #     postprocess_params = {
-        #         'name': 'SRNLabelDecode',
-        #         "character_type": args.rec_char_type,
-        #         "character_dict_path": args.rec_char_dict_path,
-        #         "use_space_char": args.use_space_char
-        #     }
-        # elif self.rec_algorithm == "RARE":
-        #     postprocess_params = {
-        #         'name': 'AttnLabelDecode',
-        #         "character_type": args.rec_char_type,
-        #         "character_dict_path": args.rec_char_dict_path,
-        #         "use_space_char": args.use_space_char
-        #     }
         self.postprocess_op = build_post_process(postprocess_params)
-        # self.predictor, self.input_tensor, self.output_tensors = \
-        #     utility.create_predictor(args, 'rec', logger)
 
         rec_model_path = args.rec_model_dir
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -100,19 +87,22 @@ class TextRecognizer(object):
         self.predictor.to(self.device)
         self.predictor.eval()
 
+        if str(self.device) != 'cpu':
+            self.gpu_id = get_current_process_gpu_id()
+        else:
+            self.gpu_id = None
+
     def resize_norm_img(self, img, max_wh_ratio):
         h, w = img.shape[:2]
         imgC, imgH, imgW = self.rec_image_shape
         assert imgC == img.shape[2]
         # print('max_wh_ratio', max_wh_ratio)
+        # max_wh_ratio h是w的10倍,直接返回
         if max_wh_ratio < 0.1:
-            # if h > imgW:
-            #     resized_image = cv2.resize(img, (w, imgW))
-            # else:
-            #     resized_image = img
-
-            # max_wh_ratio h是w的10倍,直接跳过
-            resized_w = None
+            # log('max_wh_ratio < 0.1', )
+            resized_image = img.astype('float32')
+            resized_image = resized_image.transpose((2, 0, 1)) / 255
+            return resized_image
         else:
             if self.character_type == "ch":
                 imgW = int((32 * max_wh_ratio))
@@ -138,186 +128,211 @@ class TextRecognizer(object):
             padding_im[:, :, 0:resized_w] = resized_image
         return padding_im
 
-    def resize_norm_img_srn(self, img, image_shape):
-        imgC, imgH, imgW = image_shape
-
-        img_black = np.zeros((imgH, imgW))
-        im_hei = img.shape[0]
-        im_wid = img.shape[1]
-
-        if im_wid <= im_hei * 1:
-            img_new = cv2.resize(img, (imgH * 1, imgH))
-        elif im_wid <= im_hei * 2:
-            img_new = cv2.resize(img, (imgH * 2, imgH))
-        elif im_wid <= im_hei * 3:
-            img_new = cv2.resize(img, (imgH * 3, imgH))
+    def predict(self, norm_img_batch):
+        tensor = torch.from_numpy(norm_img_batch).float()
+        # if norm_img.shape[3] >= 100 and get_platform() != "Windows" and not MAX_COMPUTE:
+        if get_platform() != "Windows" and not MAX_COMPUTE:
+            # 加锁
+            time2 = time.time()
+            lock_file_sub = 'ocr'
+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
+            f = file_lock(lock_file)
+            log("rec get file_lock " + lock_file + " time " + str(time.time()-time2))
+            try:
+                time2 = time.time()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                tensor = tensor.to(self.device)
+                with torch.no_grad():
+                    out = self.predictor(tensor)
+                log("get file_lock run rec" + " time " + str(time.time()-time2))
+            except RuntimeError:
+                log("ocr/tools/infer/predict_rec.py predict.run error! maybe no gpu memory!")
+                log("rec predictor shrink memory! ori_im.shape " + str(norm_img_batch.shape))
+                get_gpu_memory_usage()
+                raise RuntimeError
+            finally:
+                f.close()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                gc.collect()
         else:
-            img_new = cv2.resize(img, (imgW, imgH))
-
-        img_np = np.asarray(img_new)
-        img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
-        img_black[:, 0:img_np.shape[1]] = img_np
-        img_black = img_black[:, :, np.newaxis]
-
-        row, col, c = img_black.shape
-        c = 1
-
-        return np.reshape(img_black, (c, row, col)).astype(np.float32)
-
-    def srn_other_inputs(self, image_shape, num_heads, max_text_length):
-
-        imgC, imgH, imgW = image_shape
-        feature_dim = int((imgH / 8) * (imgW / 8))
-
-        encoder_word_pos = np.array(range(0, feature_dim)).reshape(
-            (feature_dim, 1)).astype('int64')
-        gsrm_word_pos = np.array(range(0, max_text_length)).reshape(
-            (max_text_length, 1)).astype('int64')
-
-        gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
-        gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
-            [-1, 1, max_text_length, max_text_length])
-        gsrm_slf_attn_bias1 = np.tile(
-            gsrm_slf_attn_bias1,
-            [1, num_heads, 1, 1]).astype('float32') * [-1e9]
-
-        gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
-            [-1, 1, max_text_length, max_text_length])
-        gsrm_slf_attn_bias2 = np.tile(
-            gsrm_slf_attn_bias2,
-            [1, num_heads, 1, 1]).astype('float32') * [-1e9]
-
-        encoder_word_pos = encoder_word_pos[np.newaxis, :]
-        gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
-
-        return [
-            encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
-            gsrm_slf_attn_bias2
-        ]
-
-    def process_image_srn(self, img, image_shape, num_heads, max_text_length):
-        norm_img = self.resize_norm_img_srn(img, image_shape)
-        norm_img = norm_img[np.newaxis, :]
-
-        [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
-            self.srn_other_inputs(image_shape, num_heads, max_text_length)
-
-        gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
-        gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
-        encoder_word_pos = encoder_word_pos.astype(np.int64)
-        gsrm_word_pos = gsrm_word_pos.astype(np.int64)
-
-        return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
-                gsrm_slf_attn_bias2)
+            tensor = tensor.to(self.device)
+            with torch.no_grad():
+                out = self.predictor(tensor)
+        # logging.info("ocr model predict time - rec" + str(time.time()-start_time))
+        out = out.cpu().numpy()
+        preds = out
+        return preds
+
+    def predict_batch(self, batch_list):
+        batch_out_list = []
+        if get_platform() != "Windows" and not MAX_COMPUTE and self.gpu_id is not None:
+            # 加锁
+            time2 = time.time()
+            lock_file_sub = f'ocr_{self.gpu_id}'
+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
+            f = file_lock(lock_file)
+            log("rec get file_lock " + lock_file + " time " + str(time.time()-time2))
+            try:
+                time2 = time.time()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                for sub_batch_list in batch_list:
+                    sub_batch_out = []
+                    for tensor in sub_batch_list:
+                        with torch.no_grad():
+                            out = self.predictor(tensor)
+                            out = out.cpu().numpy()
+                        sub_batch_out.append(out)
+                    # sub_batch_out = np.concatenate(sub_batch_out, axis=0)
+                    batch_out_list.append(sub_batch_out)
+                log("get file_lock run rec" + " time " + str(time.time()-time2))
+
+            except RuntimeError:
+                log("ocr/tools/infer/predict_rec.py predict.run error! maybe no gpu memory!")
+                log("rec predictor shrink memory! ori_im.shape " + str(tensor.shape))
+                get_gpu_memory_usage()
+                raise RuntimeError
+            finally:
+                f.close()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+        else:
+            for sub_batch_list in batch_list:
+                sub_batch_out = []
+                for tensor in sub_batch_list:
+                    # print('tensor.shape', tensor.shape)
+                    with torch.no_grad():
+                        out = self.predictor(tensor)
+                        out = out.cpu().numpy()
+                    # print('out.shape', out.shape)
+                    sub_batch_out.append(out)
+                # sub_batch_out = np.concatenate(sub_batch_out, axis=0)
+                batch_out_list.append(sub_batch_out)
+
+        # 转为numpy
+        for bi, sub_batch_out in enumerate(batch_out_list):
+            batch_out_list[bi] = np.concatenate(sub_batch_out, axis=0)
+        return batch_out_list
 
     def __call__(self, img_list):
+        start_time = time.time()
+        # print('into TextRecognizer __call__')
         img_num = len(img_list)
-        # Calculate the aspect ratio of all text bars
+
+        # 过滤图片比例异常的
+        # print('rec len(img_list)', len(img_list))
+        temp_list = []
+        for img in img_list:
+            if img.shape[0] == 0 or img.shape[1] == 0 \
+                    or img.shape[0] >= 10000 or img.shape[1] >= 10000 \
+                    or img.shape[1] / img.shape[0] <= 0.5 \
+                    or img.shape[1] / img.shape[0] >= 100:
+                # print('rec img.shape[1] / img.shape[0] <= 0.5', img.shape)
+                continue
+            temp_list.append(img)
+        if not temp_list:
+            return None, 0
+        img_list = temp_list
+
+        # 按比例排序
         width_list = []
         i = 0
         for img in img_list:
-            # cv2.imwrite('D:/myProject/format_conversion_maxcompute/ocr/test/'+str(i)+'.jpg',img)
-            # i+=1
-            # cv2.imshow('img', img)
-            # cv2.waitKey(1000)
             width_list.append(img.shape[1] / float(img.shape[0]))
         # Sorting can speed up the recognition process
         indices = np.argsort(np.array(width_list))
 
+        # 分批预测
         # rec_res = []
         rec_res = [['', 0.0]] * img_num
         batch_num = self.rec_batch_num
         elapse = 0
+        batch_list = []
         for beg_img_no in range(0, img_num, batch_num):
             end_img_no = min(img_num, beg_img_no + batch_num)
             norm_img_batch = []
             max_wh_ratio = 0
+            # 取这个batch中比例最大的
             for ino in range(beg_img_no, end_img_no):
                 # h, w = img_list[ino].shape[0:2]
                 h, w = img_list[indices[ino]].shape[0:2]
                 wh_ratio = w * 1.0 / h
                 max_wh_ratio = max(max_wh_ratio, wh_ratio)
-            # print('max_wh_ratio',max_wh_ratio)
+            # print('max_wh_ratio', max_wh_ratio)
+
+            # resize image
             for ino in range(beg_img_no, end_img_no):
-                if self.rec_algorithm != "SRN":
-                    # print('max_wh_ratio', max_wh_ratio)
-                    norm_img = self.resize_norm_img(img_list[indices[ino]],
-                                                    max_wh_ratio)
-                    # cv2.imshow('img', norm_img.transpose(1,2,0))
-                    # cv2.waitKey(1000)
-                    norm_img = norm_img[np.newaxis, :]
-                    norm_img_batch.append(norm_img)
-                else:
-                    # norm_img = self.process_image_srn(
-                    #     img_list[indices[ino]], self.rec_image_shape, 8, 25)
-                    # encoder_word_pos_list = []
-                    # gsrm_word_pos_list = []
-                    # gsrm_slf_attn_bias1_list = []
-                    # gsrm_slf_attn_bias2_list = []
-                    # encoder_word_pos_list.append(norm_img[1])
-                    # gsrm_word_pos_list.append(norm_img[2])
-                    # gsrm_slf_attn_bias1_list.append(norm_img[3])
-                    # gsrm_slf_attn_bias2_list.append(norm_img[4])
-                    # norm_img_batch.append(norm_img[0])
-                    pass
+                # print('img_list[indices[ino]].shape', img_list[indices[ino]].shape)
+                norm_img = self.resize_norm_img(img_list[indices[ino]],
+                                                max_wh_ratio)
+                # print('norm_img.shape', norm_img.shape)
+                norm_img = norm_img[np.newaxis, :]
+                norm_img_batch.append(norm_img)
+
             norm_img_batch = np.concatenate(norm_img_batch)
             norm_img_batch = norm_img_batch.copy()
 
-            if self.rec_algorithm == "SRN":
-                # starttime = time.time()
-                # encoder_word_pos_list = np.concatenate(encoder_word_pos_list)
-                # gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list)
-                # gsrm_slf_attn_bias1_list = np.concatenate(
-                #     gsrm_slf_attn_bias1_list)
-                # gsrm_slf_attn_bias2_list = np.concatenate(
-                #     gsrm_slf_attn_bias2_list)
-                #
-                # inputs = [
-                #     norm_img_batch,
-                #     encoder_word_pos_list,
-                #     gsrm_word_pos_list,
-                #     gsrm_slf_attn_bias1_list,
-                #     gsrm_slf_attn_bias2_list,
-                # ]
-                # input_names = self.predictor.get_input_names()
-                # for i in range(len(input_names)):
-                #     input_tensor = self.predictor.get_input_handle(input_names[
-                #         i])
-                #     input_tensor.copy_from_cpu(inputs[i])
-                # self.predictor.run()
-                # outputs = []
-                # for output_tensor in self.output_tensors:
-                #     output = output_tensor.copy_to_cpu()
-                #     outputs.append(output)
-                # preds = {"predict": outputs[2]}
-                pass
+            # 预测
+            # starttime = time.time()
+            # # 当图片很长时,降低batch,防止爆内存
+            # # print('norm_img_batch.shape', norm_img_batch.shape)
+            # preds = []
+            # if norm_img_batch.shape[-1] >= 400:
+            #     if norm_img_batch.shape[-1] <= 1000:
+            #         mini_batch_size = 4
+            #     elif norm_img_batch.shape[-1] <= 3000:
+            #         mini_batch_size = 2
+            #     else:
+            #         mini_batch_size = 1
+            #     for bi in range(0, norm_img_batch.shape[0], mini_batch_size):
+            #         sub_batch = norm_img_batch[bi:bi+mini_batch_size]
+            #         sub_preds = self.predict(sub_batch)
+            #         preds.append(sub_preds)
+            #         # print('type(sub_preds), sub_preds.shape', type(sub_preds), sub_preds.shape)
+            #     preds = np.concatenate(preds, axis=0)
+            # else:
+            #     preds = self.predict(norm_img_batch)
+            # # print('type(preds), preds.shape', type(preds), preds.shape)
+            #
+            # # 后处理
+            # rec_result = self.postprocess_op(preds)
+            # for rno in range(len(rec_result)):
+            #     rec_res[indices[beg_img_no + rno]] = rec_result[rno]
+            # elapse += time.time() - starttime
+
+            # 根据长度,动态batch
+            if norm_img_batch.shape[-1] >= 400:
+                if norm_img_batch.shape[-1] <= 1000:
+                    mini_batch_size = 4
+                elif norm_img_batch.shape[-1] <= 3000:
+                    mini_batch_size = 2
+                else:
+                    mini_batch_size = 1
+                sub_batch_list = []
+                for bi in range(0, norm_img_batch.shape[0], mini_batch_size):
+                    sub_batch = norm_img_batch[bi:bi+mini_batch_size]
+                    tensor = torch.from_numpy(sub_batch).float()
+                    tensor = tensor.to(self.device)
+                    sub_batch_list.append(tensor)
             else:
-                starttime = time.time()
-
                 tensor = torch.from_numpy(norm_img_batch).float()
-                start_time = time.time()
                 tensor = tensor.to(self.device)
-                with torch.no_grad():
-                    out = self.predictor(tensor)
-                logging.info("ocr model predict time - rec" + str(time.time()-start_time))
-                out = out.cpu().numpy()
-                preds = out
+                sub_batch_list = [tensor]
 
-            # print("tools/infer/predict_rec preds", preds)
-            rec_result = self.postprocess_op(preds)
-            for rno in range(len(rec_result)):
-                # print("predict_rec", img_num, batch_num, beg_img_no,
-                #       indices[beg_img_no + rno], len(rec_res))
-                rec_res[indices[beg_img_no + rno]] = rec_result[rno]
-            elapse += time.time() - starttime
-            # 释放内存
-            # self.predictor.clear_intermediate_tensor()
-            # self.predictor.try_shrink_memory()
-
-            # gc.collect()
-            if str(self.device)!='cpu':
-                torch.cuda.empty_cache()
-            #     gc.collect()
+            batch_list.append(sub_batch_list)
+
+        # 预测
+        batch_out_list = self.predict_batch(batch_list)
+
+        # 后处理
+        for bi, out in enumerate(batch_out_list):
+            begin_img_no = bi * batch_num
+            rec_result = self.postprocess_op(out)
+            for ri in range(len(rec_result)):
+                rec_res[indices[begin_img_no + ri]] = rec_result[ri]
+        elapse += time.time() - start_time
         return rec_res, elapse
 
 

+ 106 - 45
ocr/tools/infer/predict_system.py

@@ -26,17 +26,19 @@ import copy
 import numpy as np
 import time
 from PIL import Image
+
 os.environ['FLAGS_eager_delete_tensor_gb'] = '0'
 import utility as utility
 # import ocr.tools.infer.predict_rec as predict_rec
-import ocr.tools.infer.predict_rec_pytorch as predict_rec # pytorch rec model
+import ocr.tools.infer.predict_rec_pytorch as predict_rec  # pytorch rec model
 # import ocr.tools.infer.predict_det as predict_det
-import ocr.tools.infer.predict_det_pytorch as predict_det # pytorch det model
+import ocr.tools.infer.predict_det_pytorch as predict_det  # pytorch det model
 import ocr.tools.infer.predict_cls as predict_cls
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
 from ocr.ppocr.utils.logging import get_logger
 from ocr.tools.infer.utility import draw_ocr_box_txt
-from format_convert.utils import has_intersection
+from format_convert.utils import has_intersection, log
+from format_convert import _global
 
 logger = get_logger()
 
@@ -61,27 +63,36 @@ class TextSystem(object):
         points[:, 0] = points[:, 0] - left
         points[:, 1] = points[:, 1] - top
         '''
-        img_crop_width = int(
-            max(
-                np.linalg.norm(points[0] - points[1]),
-                np.linalg.norm(points[2] - points[3])))
-        img_crop_height = int(
-            max(
-                np.linalg.norm(points[0] - points[3]),
-                np.linalg.norm(points[1] - points[2])))
-        pts_std = np.float32([[0, 0], [img_crop_width, 0],
-                              [img_crop_width, img_crop_height],
-                              [0, img_crop_height]])
-        M = cv2.getPerspectiveTransform(points, pts_std)
-        dst_img = cv2.warpPerspective(
-            img,
-            M, (img_crop_width, img_crop_height),
-            borderMode=cv2.BORDER_REPLICATE,
-            flags=cv2.INTER_CUBIC)
-        dst_img_height, dst_img_width = dst_img.shape[0:2]
-        # if dst_img_height * 1.0 / dst_img_width >= 1.5:
-        if dst_img_height * 1.0 / dst_img_width >= 2.0:
-            dst_img = np.rot90(dst_img)
+        # img_crop_width = int(
+        #     max(
+        #         np.linalg.norm(points[0] - points[1]),
+        #         np.linalg.norm(points[2] - points[3])))
+        # img_crop_height = int(
+        #     max(
+        #         np.linalg.norm(points[0] - points[3]),
+        #         np.linalg.norm(points[1] - points[2])))
+        # pts_std = np.float32([[0, 0], [img_crop_width, 0],
+        #                       [img_crop_width, img_crop_height],
+        #                       [0, img_crop_height]])
+        # M = cv2.getPerspectiveTransform(points, pts_std)
+        # dst_img = cv2.warpPerspective(
+        #     img,
+        #     M, (img_crop_width, img_crop_height),
+        #     borderMode=cv2.BORDER_REPLICATE,
+        #     flags=cv2.INTER_CUBIC)
+        # print('dst_img.shape', dst_img.shape)
+        #
+        # print('points', points)
+        w = abs(points[2][0] - points[0][0])
+        h = abs(points[2][1] - points[0][1])
+        dst_img = img[int(points[0][1]):int(points[0][1] + h), int(points[0][0]):int(points[0][0] + w), :]
+        # print('dst_img.shape2', dst_img.shape)
+        # cv2.imshow('dst_img', dst_img)
+        # cv2.waitKey(0)
+        # dst_img_height, dst_img_width = dst_img.shape[0:2]
+        # # if dst_img_height * 1.0 / dst_img_width >= 1.5:
+        # if dst_img_height * 1.0 / dst_img_width >= 2.0:
+        #     dst_img = np.rot90(dst_img)
         return dst_img
 
     def print_draw_crop_rec_res(self, img_crop_list, rec_res):
@@ -91,6 +102,7 @@ class TextSystem(object):
             logger.info(bno, rec_res[bno])
 
     def __call__(self, img):
+        # print('into TextSystem __call__')
         # cv2.imshow('img',img)
         # cv2.waitKey(0)
         ori_im = img.copy()
@@ -98,15 +110,65 @@ class TextSystem(object):
         logger.info("dt_boxes num : {}, elapse : {}".format(
             len(dt_boxes), elapse))
         if dt_boxes is None:
-            return None, None
-        img_crop_list = []
+            return [], []
 
-        dt_boxes = sorted_boxes(dt_boxes)
+        temp_list = []
+        # print('dt_boxes', type(dt_boxes))
+        # print('dt_boxes.shape', dt_boxes.shape)
+        # 过滤一些比例离谱的box
+        for b in dt_boxes:
+            w = b[2][0] - b[0][0]
+            h = b[2][1] - b[0][1]
+            if h == 0 or w == 0 \
+                    or h >= 10000 or w >= 10000 \
+                    or w / h <= 0.5 or w / h >= 100:
+                continue
+            temp_list.append(b)
+
+        if not temp_list:
+            return [], []
+        dt_boxes = np.array(temp_list)
+        # print('dt_boxes.shape2', dt_boxes.shape)
+
+        # show
+        # for b in dt_boxes:
+        #     p1 = [int(x) for x in b[0]]
+        #     p2 = [int(x) for x in b[2]]
+        #     cv2.rectangle(img, p1, p2, (0, 0, 255))
+        # cv2.namedWindow('img', cv2.WINDOW_NORMAL)
+        # cv2.imshow('img', img)
+        # cv2.waitKey(0)
+
+        # # 检测过多单字box,返回None
+        # if len(dt_boxes) >= 150:
+        #     short_box_cnt = 0
+        #     long_box_cnt = 0
+        #     for b in dt_boxes:
+        #         w = b[2][0] - b[0][0]
+        #         h = b[2][1] - b[0][1]
+        #         if w / h < 1.3:
+        #             short_box_cnt += 1
+        #         if w / h >= 3:
+        #             long_box_cnt += 1
+        #         print('dt_boxes', w, h, round(w/h, 3))
+        #     # print('short_box_cnt, len(dt_boxes)', short_box_cnt, len(dt_boxes))
+        #     log('short_box_cnt, long_box_cnt, len(dt_boxes) ' + str([short_box_cnt, long_box_cnt, len(dt_boxes)]))
+        #     if short_box_cnt >= 2/3 * len(dt_boxes) and long_box_cnt < 10:
+        #         # print('short_box_cnt >= 2/3 * len(dt_boxes), return None')
+        #         log('short_box_cnt >= 2/3 * len(dt_boxes), return None. ' + str([short_box_cnt, long_box_cnt, len(dt_boxes)]))
+        #         return [], []
 
+        img_crop_list = []
+        dt_boxes = sorted_boxes(dt_boxes)
         for bno in range(len(dt_boxes)):
             tmp_box = copy.deepcopy(dt_boxes[bno])
             img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
             img_crop_list.append(img_crop)
+        # print('system len(img_crop_list)', len(img_crop_list))
+        # for img in img_crop_list:
+        #     if img.shape[1] / img.shape[0] <= 0.5:
+        # print('system img.shape[1] / img.shape[0] <= 0.5', img.shape)
+
         if self.use_angle_cls:
             img_crop_list, angle_list, elapse = self.text_classifier(
                 img_crop_list)
@@ -131,6 +193,7 @@ class TextSystem(object):
                 filter_rec_res.append(rec_reuslt)
         return filter_boxes, filter_rec_res
 
+
 def boxex_points_fixup(dt_boxes):
     # 检查框全部转换为矩形
     # for i in range(len(dt_boxes)):
@@ -143,39 +206,37 @@ def boxex_points_fixup(dt_boxes):
     #     y_min = min(y_list)
     #     dt_boxes[i] = np.array([[x_min,y_min],[x_max,y_min],[x_max,y_max],[x_min,y_max]])
 
-
     for i in range(len(dt_boxes)):
         box1 = dt_boxes[i]
         box1_point3 = box1[2]
-        box1_point4 = box1[3] # 四边形底边的两点坐标
-        bottom_line = (min(box1_point3[0],box1_point4[0]),max(box1_point3[0],box1_point4[0]))
-        bottom_line_len = abs(bottom_line[1]-bottom_line[0])
+        box1_point4 = box1[3]  # 四边形底边的两点坐标
+        bottom_line = (min(box1_point3[0], box1_point4[0]), max(box1_point3[0], box1_point4[0]))
+        bottom_line_len = abs(bottom_line[1] - bottom_line[0])
 
-        for j in range(i+1,len(dt_boxes)):
+        for j in range(i + 1, len(dt_boxes)):
             box2 = dt_boxes[j]
             box2_point1 = box2[0]
-            box2_point2 = box2[1] # 四边形顶边的两点坐标
+            box2_point2 = box2[1]  # 四边形顶边的两点坐标
             top_line = (min(box2_point1[0], box2_point2[0]), max(box2_point1[0], box2_point2[0]))
-            top_line_len = abs(top_line[1]-top_line[0])
+            top_line_len = abs(top_line[1] - top_line[0])
             if has_intersection(box1, box2):  # 四边形框是否有交集
-                if not (min(top_line)>=max(bottom_line) or min(bottom_line)>=max(top_line)):  # x轴方向上有交集
+                if not (min(top_line) >= max(bottom_line) or min(bottom_line) >= max(top_line)):  # x轴方向上有交集
                     # 求重合部分y中间值
                     mid_y = ((box2_point1[1] + box2_point2[1]) / 2 + (box1_point3[1] + box1_point4[1]) / 2) // 2
                     if not mid_y:
                         continue
-                    max_line_len = max(bottom_line_len,top_line_len)
+                    max_line_len = max(bottom_line_len, top_line_len)
                     cross_line_len = bottom_line_len + top_line_len - \
-                                     (max(bottom_line[1],bottom_line[0],top_line[1],top_line[0]) - min(bottom_line[1],bottom_line[0],top_line[1],top_line[0]))
+                                     (max(bottom_line[1], bottom_line[0], top_line[1], top_line[0]) - min(
+                                         bottom_line[1], bottom_line[0], top_line[1], top_line[0]))
                     # print(cross_line_len,max_line_len,cross_line_len/max_line_len)
-                    if cross_line_len/max_line_len>=0.55: # 重合比例
-                        box1[2] = [box1_point3[0],mid_y]
-                        box1[3] = [box1_point4[0],mid_y]
-                        box2[0] = [box2_point1[0],mid_y]
-                        box2[1] = [box2_point2[0],mid_y]
+                    if cross_line_len / max_line_len >= 0.55:  # 重合比例
+                        box1[2] = [box1_point3[0], mid_y]
+                        box1[3] = [box1_point4[0], mid_y]
+                        box2[0] = [box2_point1[0], mid_y]
+                        box2[1] = [box2_point2[0], mid_y]
                         break
 
-
-
     return dt_boxes
 
 
@@ -247,4 +308,4 @@ def main(args):
 if __name__ == "__main__":
     main(utility.parse_args())
 
-    pass
+    pass

+ 1 - 0
start_and_stop/kill_convert.sh

@@ -0,0 +1 @@
+kill -9 $(lsof -i:15010|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')

+ 256 - 30
tika_/tika_interface.py

@@ -1,3 +1,5 @@
+import base64
+import io
 import json
 import os
 import re
@@ -7,8 +9,11 @@ import traceback
 from glob import glob
 
 import psutil
+from PIL import Image
+from bs4 import BeautifulSoup
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from config.max_compute_config import MAX_COMPUTE
 _dir = os.path.abspath(os.path.dirname(__file__))
 os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
 os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
@@ -16,12 +21,19 @@ os.environ["TIKA_PATH"] = _dir + "/files/"
 os.environ["TIKA_LOG_FILE"] = "tika.log"
 
 from format_convert import _global
-from format_convert.utils import log, request_post, dynamic_get_port
+from format_convert.utils import log, request_post, dynamic_get_port, get_platform
 import tika
 from tika import parser, config
 from tika.tika import runCommand
 from flask import Flask, request
 
+if get_platform() == "Windows":
+    FROM_REMOTE = False
+else:
+    FROM_REMOTE = True
+
+if MAX_COMPUTE:
+    FROM_REMOTE = False
 
 # 接口配置
 app = Flask(__name__)
@@ -46,18 +58,18 @@ def _tika():
         _md5 = request.form.get("md5")
         _global.update({"md5": _md5})
 
-        html = tika_interface(data).get('html')
-        return json.dumps({"html": html})
+        html = tika_interface(data).get('data')
+        return json.dumps({"data": html})
     except TimeoutError:
-        return json.dumps({"html": [-5]})
+        return json.dumps({"data": [-5]})
     except:
         traceback.print_exc()
-        return json.dumps({"html": [-1]})
+        return json.dumps({"data": [-1]})
     finally:
         log("tika interface finish time " + str(time.time()-start_time))
 
 
-def tika_interface(_path, show=1):
+def tika_interface(_path, show=0):
     try:
         # apache tika服务器 提取
         # text = runCommand('parse', 'all', _path, '9998', outDir='./files/')
@@ -67,7 +79,8 @@ def tika_interface(_path, show=1):
         if globals().get(key):
             port = globals().get(key)
         else:
-            port = dynamic_get_port(port)
+            if FROM_REMOTE:
+                port = dynamic_get_port(port)
             if port is None:
                 kill_tika_java_server()
                 # return {"html": [-19]}
@@ -76,31 +89,104 @@ def tika_interface(_path, show=1):
         url = 'http://localhost:' + str(port)
         log('tika ' + key + ' port: ' + str(port))
         parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
-        html = parsed.get('content')
-
-        # 处理html
-        html = html.split('\n')
-        temp_list = []
-        for line in html:
-            if '<meta' in line:
-                continue
-            temp_list.append(line)
-        html = temp_list
-        if len(html) <= 4:
-            return {"html": ''}
-
-        html = html[:2] + ['<meta charset="UTF-8">'] + html[2:]
-        html = '\n'.join(html)
-        html = re.sub('<table>', '<table border="1">', html)
-        html = re.sub(' class="正文"', '', html)
+        # print('parsed', parsed)
+        html = parsed.get('content', '')
 
+        # 提取html各种元素,其中图片只是一个映射
+        soup = BeautifulSoup(html, 'lxml')
+        tag_list = collect_soup_elements(soup)
         if show:
-            with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
-                f.write(html)
+            print('tag_list0', tag_list)
+
+        if not tag_list:
+            return {"data": tag_list}
+
+        # docx不是二进制,不能直接读二进制图片
+        if _path[-3:] == 'doc':
+            # 直接从二进制提取图片,保存在同一目录下
+            ss = re.split('[/\\\]', _path)
+            save_dir = os.sep.join(ss[:-1])
+            file_name = re.split('\.', ss[-1])[0]
+            if show:
+                print('save_dir', save_dir)
+                print('file_name', file_name)
+            image_path_dict = extract_images_from_doc(_path, save_dir)
+
+            if show:
+                print('image_path_dict', image_path_dict)
+
+            # embedded_images = re.findall(r'embedded:image[^"]+', html)
+            match_flag = 1
+            for tag in tag_list:
+                tag_name, value = tag
+                if tag_name != 'img':
+                    continue
+                # 提取图片文件名
+                image_name = file_name + '_' + re.sub('image', '', value)
+                if show:
+                    print('image_name', image_name)
+                # 保证所有image映射都对得上
+                real_image_path = image_path_dict.get(image_name)
+                if real_image_path is None:
+                    match_flag = 0
+                    break
+                else:
+                    tag[1] = real_image_path
+            if show:
+                print('match_flag', match_flag)
+
+            if match_flag:
+                # 图片数量能对上,则是正确的
+                pass
+            else:
+                # 图片对不上,则删除所有图片类型的tag
+                temp_list = []
+                for tag_name, value in tag_list:
+                    if tag_name == 'img':
+                        continue
+                    temp_list.append([tag_name, value])
+                tag_list = temp_list
+
+        elif _path[-4:] == 'docx':
+            temp_list = []
+            for tag_name, value in tag_list:
+                if tag_name == 'img':
+                    continue
+                temp_list.append([tag_name, value])
+            tag_list = temp_list
+
+
+        # # 处理html
+        # html = html.split('\n')
+        # temp_list = []
+        # for line in html:
+        #     if '<meta' in line:
+        #         continue
+        #     temp_list.append(line)
+        # html = temp_list
+        # if len(html) <= 4:
+        #     return {"html": ''}
+        #
+        # html = html[:2] + ['<meta charset="UTF-8">'] + html[2:]
+        # html = '\n'.join(html)
+        # html = re.sub('<table>', '<table border="1">', html)
+        # html = re.sub(' class="正文"', '', html)
+        #
+        # if show:
+        #     with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
+        #         f.write(html)
+    # except:
+    #     traceback.print_exc()
+    #     return {"html": [-17]}
+    # return {"html": html}
+
+        if show:
+            print('tag_list final', tag_list)
+
     except:
         traceback.print_exc()
-        return {"html": [-17]}
-    return {"html": html}
+        return {"data": [-17]}
+    return {"data": tag_list}
 
 
 def kill_tika_java_server():
@@ -122,6 +208,139 @@ def kill_tika_java_server():
             os.system(comm)
 
 
+def extract_images_from_doc(doc_file_path, output_folder):
+    # 定义图片格式相关的标志
+    image_signatures = {
+        'jpg': (b'\xFF\xD8', b'\xFF\xD9'),
+        'png': (b'\x89PNG', b'\x49\x45\x4E\x44\xAE\x42\x60\x82')
+    }
+
+    file_name = re.split('[/\\\.]', doc_file_path)[-2]
+
+    # 读取.doc文件
+    with open(doc_file_path, 'rb') as doc_file:
+        doc_data = doc_file.read()
+
+    output_file_path_dict = {}
+    # 查找并提取所有图片
+    for img_format, (start_sig, end_sig) in image_signatures.items():
+        start_index = 0
+        image_count = 1
+        while True:
+            # 查找图片起始位置
+            start_index = doc_data.find(start_sig, start_index)
+            if start_index == -1:
+                break
+
+            # 查找图片结束位置
+            end_index = doc_data.find(end_sig, start_index)
+            if end_index == -1:
+                break
+
+            # 提取图片数据
+            end_index += len(end_sig)  # 包含结束标志
+            image_data = doc_data[start_index:end_index]
+
+            # 保存图片
+            # image_count = len([f for f in os.listdir(output_folder) if f.endswith(f'.{img_format}')])
+            image_name = f'{file_name}_{image_count}.{img_format}'
+            image_path = os.path.join(output_folder, image_name)
+            with open(image_path, 'wb') as img_file:
+                img_file.write(image_data)
+            print(f'Saved {img_format} image to {image_path}')
+            output_file_path_dict[image_name] = image_path
+
+            # 继续查找下一个图片
+            start_index = end_index
+            image_count += 1
+    return output_file_path_dict
+
+
+def is_image_valid(image_path):
+    try:
+        # 尝试打开图片
+        with Image.open(image_path) as img:
+            # 如果图片可以打开并且没有问题,则返回 True
+            img.load()
+            return True
+    except:
+        # 如果出现异常,则返回 False
+        return False
+
+
+def is_image_data_valid(image_data):
+    """
+    判断图片数据流是否可以正常打开
+
+    Args:
+        image_data (bytes): 图片数据流
+
+    Returns:
+        bool: 如果图片数据流可以正常打开,则返回True,否则返回False
+    """
+    try:
+        # 将图片数据流转换为文件类对象
+        image_file = io.BytesIO(image_data)
+        # 尝试打开图片
+        with Image.open(image_file) as img:
+            # 如果图片可以打开并且没有问题,则返回True
+            img.load()
+            return True
+    except:
+        # 如果出现异常,则返回False
+        return False
+
+
+def collect_soup_elements(soup):
+    # elements = []
+    # # print('tags', tags)
+    # for tag in tags:
+    #     for element in tag.children:
+    #         print('element', element)
+    #         if element.name == 'img':
+    #             # 提取<img>标签的alt属性
+    #             alt_value = element.get('alt')
+    #             print(f"Image: {alt_value}")
+    #             elements.append(['img', alt_value])
+    #         elif element.name == 'table':
+    #             elements.append(['table', element])
+    #         elif element.string and element.string.strip():
+    #             # 提取文本内容
+    #             text = element.string.strip()
+    #             print(f"Text: {text}")
+    #             elements.append(['text', text])
+
+    table_tags = soup.find_all('table')
+    for table in table_tags:
+        table['border'] = "1"
+
+    elements = []
+    # 遍历所有标签
+    for element in soup.body.descendants:
+        if element.name == 'p':
+            # 提取文本
+            text = element.get_text(strip=True)
+            if text:
+                elements.append(['text', text])
+        elif element.name == 'img':
+            # 提取图片alt
+            alt = element.get('alt')
+            elements.append(['img', alt])
+        elif element.name == 'table':
+            # 提取表格数据
+            # table_data = []
+            # for row in element.find_all('tr'):
+            #     row_data = []
+            #     for cell in row.find_all('td'):
+            #         cell_text = cell.get_text(strip=True)
+            #         row_data.append(cell_text)
+            #     table_data.append(row_data)
+            for p_tag in element.find_all('p'):
+                p_tag.unwrap()
+            elements.append(['table', str(element)])
+    return elements
+
+
 def test_interface():
     # paths = glob("C:/Users/Administrator/Downloads/1716253106319.doc")
     paths = ["files/1716253106319.doc"]
@@ -153,6 +372,13 @@ if __name__ == "__main__":
     #     # _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
     #     tika_interface(_p)
 
-    # app.run(host='0.0.0.0', port=5000)
+    # app.run(host='0.0.0.0', port=16050)
     # test_interface()
-    kill_tika_java_server()
+    # kill_tika_java_server()
+
+    # p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
+    # extract_images_from_doc(p, '.')
+
+    _p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
+    c = tika_interface(_p)