пре 9 месеци · ab202ff1fc
--- a/botr/extract_table.py
+++ b/botr/extract_table.py
@@ -1,29 +1,37 @@
 
				+import copy
			
 
				+import math
			
 
				+import os
			
 
				 import re
			
 
				 import time
			
 
				 import traceback
			
 
				+from glob import glob
			
 
				+import numpy as np
			
 
				 import cv2
			
 
				+import wcwidth
			
 
				 from pdfminer.layout import LTLine
			
 
				 # from botr.nsp.predict import nsp_predict
			
 
				+from sklearn.cluster import KMeans
			
 
				+
			
 
				 from botr.rules.get_table_by_rules import get_table_by_rule
			
 
				 from botr.utils import line_iou, get_table_iou
			
 
				 from format_convert.convert_need_interface import from_yolo_interface
			
 
				-from format_convert.utils import log, np2bytes
			
 
				+from format_convert.utils import log, np2bytes, text_bbox_to_lt, pil_resize, memory_decorator
			
 
				 
			
 
				 
			
 
				 def b_table_process(list_line, list_text_boxes, list_cell, table_location):
			
 
				     def merge_textbox(textbox_list, in_objs):
			
 
				         delete_obj = []
			
 
				         threshold = 5
			
 
				-        textbox_list.sort(key=lambda x:x.bbox[0])
			
 
				+        textbox_list.sort(key=lambda x: x.bbox[0])
			
 
				         for k in range(len(textbox_list)):
			
 
				             tb1 = textbox_list[k]
			
 
				             if tb1 not in in_objs and tb1 not in delete_obj:
			
 
				-                for m in range(k+1, len(textbox_list)):
			
 
				+                for m in range(k + 1, len(textbox_list)):
			
 
				                     tb2 = textbox_list[m]
			
 
				                     if tb2 in in_objs:
			
 
				                         continue
			
 
				-                    if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
			
 
				-                            and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
			
 
				+                    if abs(tb1.bbox[1] - tb2.bbox[1]) <= threshold \
			
 
				+                            and abs(tb1.bbox[3] - tb2.bbox[3]) <= threshold:
			
 
				                         if tb1.bbox[0] <= tb2.bbox[0]:
			
 
				                             tb1.text = tb1.text + tb2.text
			
 
				                         else:
			
@@ -35,6 +43,7 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
 
				             if _obj in textbox_list:
			
 
				                 textbox_list.remove(_obj)
			
 
				         return textbox_list
			
 
				+
			
 
				     try:
			
 
				         if list_line:
			
 
				             from format_convert.convert_tree import TableLine
			
@@ -55,7 +64,7 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
 
				             current_y = area_list_text_boxes[0].bbox[1]
			
 
				             current_y2 = area_list_text_boxes[0].bbox[3]
			
 
				             # threshold = 2.
			
 
				-            threshold = max(2., 1/3 * abs(current_y2 - current_y))
			
 
				+            threshold = max(2., 1 / 3 * abs(current_y2 - current_y))
			
 
				             for t_b in area_list_text_boxes:
			
 
				                 bbox = t_b.bbox
			
 
				                 if current_y - threshold <= bbox[1] <= current_y + threshold:
			
@@ -69,6 +78,11 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
 
				             obj_in_table = []
			
 
				             table_dict = {'bbox': table_location}
			
 
				             row_list = []
			
 
				+
			
 
				+            # yolo检测出的表格，忽略两列的，因为已经补充了两列的新规则 250529
			
 
				+            if list_cell and len(list_cell[0]) == 2:
			
 
				+                return list_text_boxes, [], set()
			
 
				+
			
 
				             for row in list_cell:
			
 
				                 col_list = []
			
 
				                 for col in row:
			
@@ -112,17 +126,19 @@ def get_text_box_obj(_text_list, _bbox_list):
 
				     return _text_box_list
			
 
				 
			
 
				 
			
 
				-def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
			
 
				+def get_table(img, table_list, text_list, bbox_list, text_box_list, from_pdf=False, show=0):
			
 
				     log('start')
			
 
				     # 检测无边框表格
			
 
				     start_time_all = time.time()
			
 
				     start_time = time.time()
			
 
				     img_bytes = np2bytes(img)
			
 
				     b_table_list = from_yolo_interface(img_bytes)
			
 
				-    log('yolo detect cost: ' + str(time.time()-start_time))
			
 
				+    log('yolo detect cost: ' + str(time.time() - start_time))
			
 
				     b_table_list = b_table_list[0]
			
 
				     if not b_table_list:
			
 
				         log('detect not b_table_list')
			
 
				+        if from_pdf:
			
 
				+            save_b_table(img)
			
 
				         return [], [], []
			
 
				 
			
 
				     # if show:
			
@@ -156,8 +172,9 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
 
				         b_loc = [min_x, min_y, max_x, max_y, b_table[4]]
			
 
				         inter_flag = False
			
 
				         for table in table_list:
			
 
				-            loc = table.get('bbox')
			
 
				-            rows = table.get('table')
			
 
				+            # loc = table.get('bbox')
			
 
				+            loc = table.bbox
			
 
				+            # rows = table.get('table')
			
 
				             iou = line_iou([[0, loc[1]], [0, loc[3]]], [[0, b_loc[1]], [0, b_loc[3]]], axis=1)
			
 
				             if iou > 0.3:
			
 
				                 # if len(rows) <= 1:
			
@@ -190,7 +207,7 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
 
				             if b_loc1 in used_b_loc:
			
 
				                 continue
			
 
				             inter_flag = False
			
 
				-            for j in range(i+1, len(b_table_location_list)):
			
 
				+            for j in range(i + 1, len(b_table_location_list)):
			
 
				                 b_loc2 = b_table_location_list[j]
			
 
				                 iou = line_iou([[0, b_loc1[1]], [0, b_loc1[3]]], [[0, b_loc2[1]], [0, b_loc2[3]]], axis=1)
			
 
				                 if show:
			
@@ -230,7 +247,8 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
 
				 
			
 
				         # 根据ocr bbox，规则生成表格线
			
 
				         start_time = time.time()
			
 
				-        line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list, b_loc, show=show)
			
 
				+        line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list,
			
 
				+                                                                                 b_loc, show=show)
			
 
				         if not table_location:
			
 
				             log('get_table_by_rule not table_location')
			
 
				             continue
			
@@ -240,14 +258,15 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
 
				             area_bbox_list.append(eval(key))
			
 
				             area_text_list.append(bbox_text_dict.get(key))
			
 
				         b_text_box_list = get_text_box_obj(area_text_list, area_bbox_list)
			
 
				-        log('get_table_by_rule cost: ' + str(time.time()-start_time))
			
 
				+        log('get_table_by_rule cost: ' + str(time.time() - start_time))
			
 
				 
			
 
				         # 根据表格线生成单元格
			
 
				         start_time = time.time()
			
 
				-        b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list, table_location)
			
 
				+        b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list,
			
 
				+                                                                           table_location)
			
 
				         table_list += _table_list
			
 
				         obj_in_table_list += _obj_in_table_list
			
 
				-        log('b_table_process cost: ' + str(time.time()-start_time))
			
 
				+        log('b_table_process cost: ' + str(time.time() - start_time))
			
 
				 
			
 
				         # if not table_list:
			
 
				         #     log('table_process not table_list')
			
@@ -317,4 +336,2421 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
 
				         # _table_list[0]['table'] = new_table
			
 
				 
			
 
				     log('get_table finish ' + str(time.time() - start_time_all))
			
 
				-    return text_box_list, table_list, obj_in_table_list
			
 
				+    return text_box_list, table_list, obj_in_table_list
			
 
				+
			
 
				+
			
 
				+def save_b_table(image_np):
			
 
				+    _start_time = time.time()
			
 
				+    _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_not_detect'
			
 
				+    # _path = 'D:/Project/format_conversion_maxcompute/save_b_table_not_detect'
			
 
				+    max_index = 20000
			
 
				+    if os.path.exists(_path):
			
 
				+        file_list = glob(_path + '/*')
			
 
				+        if file_list:
			
 
				+            file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
			
 
				+            file_index_list.sort(key=lambda x: x)
			
 
				+            index = file_index_list[-1] + 1
			
 
				+        else:
			
 
				+            index = 0
			
 
				+        if index > max_index:
			
 
				+            return
			
 
				+
			
 
				+        # 文件md5
			
 
				+        from format_convert import _global
			
 
				+        _md5 = _global.get("md5")
			
 
				+
			
 
				+        _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
			
 
				+        cv2.imwrite(_image_path, image_np)
			
 
				+        log('save yolo not detect b_table image success!')
			
 
				+
			
 
				+
			
 
				+@memory_decorator
			
 
				+def get_b_table_by_blank_colon(lt_text_list, table_list, layout_bbox, image_np=None, show=0):
			
 
				+    start_time = time.time()
			
 
				+
			
 
				+    # print('len(lt_text_list)', len(lt_text_list))
			
 
				+    # for lt_text in lt_text_list:
			
 
				+    #     print('lt_text', lt_text)
			
 
				+
			
 
				+    # 新增冒号提前判断
			
 
				+    colon_cnt = 0
			
 
				+    for lt_text in lt_text_list:
			
 
				+        if re.search('[：:]', lt_text.get_text()):
			
 
				+            colon_cnt += 1
			
 
				+    if colon_cnt <= 6:
			
 
				+        log('pre judge colon_cnt <= 6')
			
 
				+        return [], []
			
 
				+
			
 
				+    # 图片类型，限制lt_text_list个数，并且很多是单字的
			
 
				+    if image_np is not None and len(lt_text_list) >= 60:
			
 
				+        single_char_cnt = 0
			
 
				+        for lt_text in lt_text_list:
			
 
				+            if len(lt_text.get_text()) <= 1:
			
 
				+                single_char_cnt += 1
			
 
				+        # log('len(lt_text_list), single_char_cnt ' + str(len(lt_text_list)) + ' ' + str(single_char_cnt))
			
 
				+        if single_char_cnt > 50 or single_char_cnt > 1/3 * len(lt_text_list):
			
 
				+            return [], []
			
 
				+
			
 
				+    # raise
			
 
				+    # 有些确定为非表格，也输出，防止后续YOLO判断为表格，搞乱数据
			
 
				+    not_b_table_list = []
			
 
				+
			
 
				+    layout_h = int(layout_bbox[3])
			
 
				+    layout_w = int(layout_bbox[2])
			
 
				+
			
 
				+    if show:
			
 
				+        print('layout_w, layout_h', layout_w, layout_h)
			
 
				+        show_image = np.full((layout_h, layout_w, 3), 255, dtype=np.uint8)
			
 
				+
			
 
				+    if show and image_np is not None:
			
 
				+        image_np_show = copy.copy(image_np)
			
 
				+        for lt_text in lt_text_list:
			
 
				+            bbox = [int(x) for x in lt_text.bbox]
			
 
				+            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
			
 
				+        cv2.imshow('image origin', image_np_show)
			
 
				+        cv2.waitKey(0)
			
 
				+
			
 
				+    # pdf类型预处理
			
 
				+    start_time1 = time.time()
			
 
				+    if image_np is None:
			
 
				+        # 把单个lt_text中，中间多个空格分割的分开
			
 
				+        lt_text_list = split_lt_text_by_many_space(lt_text_list)
			
 
				+
			
 
				+        if show:
			
 
				+            for lt_text in lt_text_list:
			
 
				+                bbox = [int(x) for x in lt_text.bbox]
			
 
				+                cv2.rectangle(show_image, bbox[:2], bbox[2:4], (0, 0, 255))
			
 
				+            cv2.imshow('pdf preprocess', show_image)
			
 
				+            cv2.waitKey(0)
			
 
				+        # log('get_b_table_by_blank_colon pdf preprocess cost: ' + str(time.time()-start_time1))
			
 
				+
			
 
				+    # 图片类型预处理
			
 
				+    start_time1 = time.time()
			
 
				+    if image_np is not None:
			
 
				+        # 删除空的
			
 
				+        start_time2 = time.time()
			
 
				+        lt_text_list = delete_empty_bbox(lt_text_list)
			
 
				+        # print('delete_empty_bbox cost: ', time.time()-start_time2)
			
 
				+
			
 
				+        # ocr识别的文本框需处理后紧贴文本，才能依靠空白分行
			
 
				+        start_time2 = time.time()
			
 
				+        new_bbox_list = shrink_bbox(image_np, [x.bbox for x in lt_text_list])
			
 
				+        # print('shrink_bbox cost: ', time.time()-start_time2)
			
 
				+        start_time2 = time.time()
			
 
				+        for i, lt_text in enumerate(lt_text_list):
			
 
				+            lt_text.bbox = new_bbox_list[i]
			
 
				+        # print('lt_text.bbox = new_bbox_list[i] cost: ', time.time()-start_time2)
			
 
				+        # log('get_b_table_by_blank_colon image preprocess1 cost: ' + str(time.time()-start_time1))
			
 
				+
			
 
				+    # 计算单字平均距离
			
 
				+    start_time1 = time.time()
			
 
				+    all_char_cnt = 0
			
 
				+    all_text_width = 0
			
 
				+    for lt_text in lt_text_list:
			
 
				+        all_char_cnt += len(lt_text.get_text())
			
 
				+        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
			
 
				+    if all_char_cnt == 0:
			
 
				+        return [], not_b_table_list
			
 
				+    avg_char_width = all_text_width / all_char_cnt
			
 
				+
			
 
				+    # 图片类型预处理2
			
 
				+    if image_np is not None:
			
 
				+        # ocr识别的表格的值可能因空格分开，合并
			
 
				+        lt_text_list = merge_same_bbox(lt_text_list, avg_char_width)
			
 
				+
			
 
				+        # bbox交叉，修复
			
 
				+        lt_text_list = fix_cross_bbox(lt_text_list)
			
 
				+        # log('get_b_table_by_blank_colon image preprocess2 cost: ' + str(time.time()-start_time1))
			
 
				+
			
 
				+    if show and image_np is not None:
			
 
				+        image_np_show = copy.copy(image_np)
			
 
				+        for lt_text in lt_text_list:
			
 
				+            bbox = [int(x) for x in lt_text.bbox]
			
 
				+            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
			
 
				+        cv2.imshow('image preprocess', image_np_show)
			
 
				+        cv2.waitKey(0)
			
 
				+
			
 
				+    if show:
			
 
				+        for lt_text in lt_text_list:
			
 
				+            print('lt_text', lt_text)
			
 
				+
			
 
				+    # 过滤xy值过大过小的
			
 
				+    temp_list = []
			
 
				+    for lt_text in lt_text_list:
			
 
				+        if min(lt_text.bbox) < 0 or max(lt_text.bbox) > 10000:
			
 
				+            continue
			
 
				+        temp_list.append(lt_text)
			
 
				+    lt_text_list = temp_list
			
 
				+
			
 
				+    if show:
			
 
				+        for lt_text in lt_text_list:
			
 
				+            cv2.rectangle(show_image,
			
 
				+                          (int(lt_text.bbox[0]), int(lt_text.bbox[1])),
			
 
				+                          (int(lt_text.bbox[2]), int(lt_text.bbox[3])),
			
 
				+                          (0, 0, 255)
			
 
				+                          )
			
 
				+        for table in table_list:
			
 
				+            cv2.rectangle(show_image,
			
 
				+                          (int(table.bbox[0]), int(table.bbox[1])),
			
 
				+                          (int(table.bbox[2]), int(table.bbox[3])),
			
 
				+                          (0, 255, 0)
			
 
				+                          )
			
 
				+
			
 
				+    # 计算单字平均距离
			
 
				+    all_char_cnt = 0
			
 
				+    all_text_width = 0
			
 
				+    for lt_text in lt_text_list:
			
 
				+        all_char_cnt += len(lt_text.get_text())
			
 
				+        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
			
 
				+    if all_char_cnt == 0:
			
 
				+        return [], not_b_table_list
			
 
				+    avg_char_width = all_text_width / all_char_cnt
			
 
				+    if show:
			
 
				+        print('avg_char_width', avg_char_width)
			
 
				+
			
 
				+    if image_np is None:
			
 
				+        blank_width = 1 * avg_char_width
			
 
				+    else:
			
 
				+        blank_width = 1 * avg_char_width
			
 
				+    if show:
			
 
				+        print('blank_width', blank_width)
			
 
				+
			
 
				+    # 根据有边框表格位置，将该页分为多个区域
			
 
				+    table_h_list = []
			
 
				+    area_h_list = []
			
 
				+    area_start_h = 0
			
 
				+    table_list.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3]))
			
 
				+    for table in table_list:
			
 
				+        table_h_list.append([table.bbox[1], table.bbox[3]])
			
 
				+        area_h_list.append([area_start_h, table.bbox[1]])
			
 
				+        area_start_h = table.bbox[3]
			
 
				+    area_h_list.append([area_start_h, layout_h])
			
 
				+
			
 
				+    if show:
			
 
				+        for min_h, max_h in area_h_list:
			
 
				+            print('area_h_list', min_h, max_h)
			
 
				+            cv2.rectangle(show_image,
			
 
				+                          (0, int(min_h)),
			
 
				+                          (layout_w, int(max_h)),
			
 
				+                          (255, 0, 0)
			
 
				+                          )
			
 
				+
			
 
				+    lt_text_area_list = []
			
 
				+    for area_min_h, area_max_h in area_h_list:
			
 
				+        sub_area = []
			
 
				+        for lt_text in lt_text_list:
			
 
				+            if area_min_h <= lt_text.bbox[1] <= lt_text.bbox[3] <= area_max_h:
			
 
				+                sub_area.append(lt_text)
			
 
				+        lt_text_area_list.append(sub_area)
			
 
				+    if show:
			
 
				+        print('len(lt_text_area_list)', len(lt_text_area_list))
			
 
				+
			
 
				+    # 每个区域分别进行判断无边框表格
			
 
				+    result_table_list = []
			
 
				+    start_time1 = time.time()
			
 
				+    for sub_lt_text_list in lt_text_area_list:
			
 
				+        start_time2 = time.time()
			
 
				+        lt_text_row_list = get_text_row_by_blank(sub_lt_text_list, layout_h)
			
 
				+        # log('get_text_row_by_blank cost: ' + str(time.time()-start_time2))
			
 
				+
			
 
				+        # 有补充的占位lt_text,需添加到lt_text_list
			
 
				+        for row in lt_text_row_list:
			
 
				+            for lt_text in row:
			
 
				+                if lt_text not in lt_text_list:
			
 
				+                    lt_text_list.append(lt_text)
			
 
				+
			
 
				+        if show:
			
 
				+            for row in lt_text_row_list:
			
 
				+                print('row', row)
			
 
				+
			
 
				+        start_time2 = time.time()
			
 
				+        b_table_list1, b_table_bbox_list1 = get_b_table_by_lt_text_row(lt_text_row_list)
			
 
				+        # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2))
			
 
				+
			
 
				+        # 确定区域后，对表格内重新分行，更精准
			
 
				+        start_time2 = time.time()
			
 
				+        table_lt_text_row_list = []
			
 
				+        for bi, b_table in enumerate(b_table_list1):
			
 
				+            b_table_bbox = b_table_bbox_list1[bi]
			
 
				+            sub_lt_text_list = []
			
 
				+            for lt_text in lt_text_list:
			
 
				+                if b_table_bbox[1] <= lt_text.bbox[1] <= lt_text.bbox[3] <= b_table_bbox[3]:
			
 
				+                    sub_lt_text_list.append(lt_text)
			
 
				+            _lt_text_row_list, center_blank_row = get_text_row_by_center_blank(b_table, sub_lt_text_list, blank_width,
			
 
				+                                                                               layout_h)
			
 
				+            table_lt_text_row_list += _lt_text_row_list
			
 
				+        # log('get_text_row_by_center_blank cost: ' + str(time.time()-start_time2))
			
 
				+
			
 
				+        start_time2 = time.time()
			
 
				+        b_table_list3, b_table_bbox_list3 = get_b_table_by_lt_text_row(table_lt_text_row_list)
			
 
				+        # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2))
			
 
				+
			
 
				+        if show:
			
 
				+            for b_table in b_table_list3:
			
 
				+                print('b_table3', b_table)
			
 
				+
			
 
				+        # 对大致的表格进行列判断，表格内不同列的框不能交叉，可以重合，需有一定空白
			
 
				+        start_time2 = time.time()
			
 
				+        b_table_list2 = []
			
 
				+        for b_table in b_table_list3:
			
 
				+
			
 
				+            blank_row_list = get_blank_row(b_table, blank_width)
			
 
				+            if show:
			
 
				+                print('b_table get_blank_row b_table_list3', b_table)
			
 
				+                print('blank_row_list b_table_list3', blank_row_list)
			
 
				+
			
 
				+            b_table2 = []
			
 
				+            for bi, lt_text_row1 in enumerate(b_table[:-1]):
			
 
				+                lt_text_row2 = b_table[bi + 1]
			
 
				+                # if row1_row2_has_same_col(lt_text_row1, lt_text_row2):
			
 
				+                if row1_row2_has_same_blank(blank_row_list[bi], blank_row_list[bi + 1]):
			
 
				+                    if lt_text_row1 not in b_table2:
			
 
				+                        b_table2.append(lt_text_row1)
			
 
				+                    if lt_text_row2 not in b_table2:
			
 
				+                        b_table2.append(lt_text_row2)
			
 
				+                else:
			
 
				+                    # print('not cross blank', blank_row_list[bi], blank_row_list[bi + 1])
			
 
				+                    if len(b_table2) >= 2:
			
 
				+                        b_table_list2.append(b_table2)
			
 
				+                    b_table2 = []
			
 
				+            if len(b_table2) >= 2:
			
 
				+                b_table_list2.append(b_table2)
			
 
				+        # log('get_blank_row cost: ' + str(time.time()-start_time2))
			
 
				+
			
 
				+        if show:
			
 
				+            for b_table2 in b_table_list2:
			
 
				+                print('b_table2')
			
 
				+                for lt_text_row in b_table2:
			
 
				+                    print('b_table2 lt_text_row', lt_text_row)
			
 
				+
			
 
				+        start_time2 = time.time()
			
 
				+        for bi, b_table2 in enumerate(b_table_list2):
			
 
				+            # 根据冒号得到表格
			
 
				+            start_time3 = time.time()
			
 
				+            table2, center_blank_row, _not_b_table_bbox_list, table_bbox \
			
 
				+                = get_b_table_by_colon(b_table2, blank_width)
			
 
				+            log('get_b_table_by_colon cost: ' + str(time.time()-start_time3))
			
 
				+            not_b_table_list += [[[], x] for x in _not_b_table_bbox_list]
			
 
				+
			
 
				+            if show and center_blank_row:
			
 
				+                print('show center_blank_row', center_blank_row)
			
 
				+                bx = int((center_blank_row[2] + center_blank_row[0]) / 2)
			
 
				+                by = int((center_blank_row[3] + center_blank_row[1]) / 2)
			
 
				+                br = int((center_blank_row[2] - center_blank_row[0]) / 2)
			
 
				+                if br <= 5:
			
 
				+                    br = 5
			
 
				+                print('bx, by, br', bx, by, br)
			
 
				+                cv2.circle(show_image, (bx, by), br, (0, 255, 0))
			
 
				+
			
 
				+            if show:
			
 
				+                min_w, min_h, max_w, max_h = table_bbox
			
 
				+                cv2.rectangle(show_image,
			
 
				+                              (int(min_w), int(min_h)),
			
 
				+                              (int(max_w), int(max_h)),
			
 
				+                              (0, 255, 0)
			
 
				+                              )
			
 
				+
			
 
				+            # 修复最后一行跨行
			
 
				+            # table2 = fix_final_row(table2)
			
 
				+
			
 
				+            # 表格末尾有些只有一列的需补充
			
 
				+            table2 = add_last_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)
			
 
				+
			
 
				+            table2 = add_first_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)
			
 
				+
			
 
				+            # table格式转化
			
 
				+            table2 = table_list_to_dict(table2)
			
 
				+
			
 
				+            # 表格一些标准化，比如去掉占位符
			
 
				+            table2 = standard_table(table2)
			
 
				+
			
 
				+            if table2:
			
 
				+                result_table_list.append([table2, table_bbox])
			
 
				+        # log('colon, add, standard cost: ' + str(time.time()-start_time2))
			
 
				+
			
 
				+    # log('get_b_table_by_blank_colon area get b_table cost: ' + str(time.time()-start_time1))
			
 
				+
			
 
				+    if show:
			
 
				+        cv2.namedWindow("final result", cv2.WINDOW_NORMAL)
			
 
				+        cv2.resizeWindow("final result", 768, 1024)
			
 
				+        cv2.imshow('final result', show_image)
			
 
				+        cv2.waitKey(0)
			
 
				+
			
 
				+    if show:
			
 
				+        for table in result_table_list:
			
 
				+            print('get_b_table_by_bbox table ', table)
			
 
				+
			
 
				+        for not_table_bbox in not_b_table_list:
			
 
				+            print('not_table bbox ', not_table_bbox)
			
 
				+
			
 
				+    # log('get_b_table_by_blank_colon cost: ' + str(time.time()-start_time))
			
 
				+    return result_table_list, not_b_table_list
			
 
				+
			
 
				+
			
 
				+def get_b_table_by_lt_text_row(lt_text_row_list, show=0):
			
 
				+    # 先大致确定区域，列数大于2的区域
			
 
				+    b_table_list1 = []
			
 
				+    b_table = []
			
 
				+
			
 
				+    for lt_text_row in lt_text_row_list:
			
 
				+        if len(lt_text_row) >= 2:
			
 
				+            b_table.append(lt_text_row)
			
 
				+        else:
			
 
				+            if len(b_table) >= 2:
			
 
				+                b_table_list1.append(b_table)
			
 
				+            b_table = []
			
 
				+    if len(b_table) >= 2:
			
 
				+        b_table_list1.append(b_table)
			
 
				+
			
 
				+    # 获取bbox
			
 
				+    b_table_bbox_list = []
			
 
				+    for b_table in b_table_list1:
			
 
				+        x1 = min([y.bbox[0] for x in b_table for y in x])
			
 
				+        y1 = min([y.bbox[1] for x in b_table for y in x])
			
 
				+        x2 = max([y.bbox[2] for x in b_table for y in x])
			
 
				+        y2 = max([y.bbox[3] for x in b_table for y in x])
			
 
				+
			
 
				+        b_table_bbox_list.append([x1, y1, x2, y2])
			
 
				+
			
 
				+    if show:
			
 
				+        for b_table in b_table_list1:
			
 
				+            print('b_table')
			
 
				+            for lt_text_row in b_table:
			
 
				+                print('b_table lt_text_row', lt_text_row)
			
 
				+    return b_table_list1, b_table_bbox_list
			
 
				+
			
 
				+
			
 
				+def row1_row2_has_same_col(row1, row2):
			
 
				+    threshold = 5
			
 
				+    blank_len = 2
			
 
				+    cross_flag = 0
			
 
				+    for lt_text1 in row1:
			
 
				+        for lt_text2 in row2:
			
 
				+            if lt_text2.bbox[0] - lt_text1.bbox[2] >= blank_len \
			
 
				+                    or lt_text1.bbox[0] - lt_text2.bbox[2] >= blank_len \
			
 
				+                    or lt_text1.bbox[0] - threshold <= lt_text2.bbox[0] < lt_text2.bbox[2] <= lt_text1.bbox[
			
 
				+                2] + threshold \
			
 
				+                    or lt_text2.bbox[0] - threshold <= lt_text1.bbox[0] < lt_text1.bbox[2] <= lt_text2.bbox[
			
 
				+                2] + threshold:
			
 
				+                pass
			
 
				+            else:
			
 
				+                cross_flag = 1
			
 
				+    if cross_flag:
			
 
				+        return False
			
 
				+    else:
			
 
				+        return True
			
 
				+
			
 
				+
			
 
				+def get_blank_row(lt_text_row_list, blank_min_width, show=0):
			
 
				+    # 获取空白行
			
 
				+    blank_row_list = []
			
 
				+    # blank_min_width = avg_char_width * 3
			
 
				+    for lt_text_row in lt_text_row_list:
			
 
				+        lt_text_row.sort(key=lambda x: x.bbox[0])
			
 
				+        blank_row = []
			
 
				+        if len(lt_text_row) < 2:
			
 
				+            blank_row_list.append([])
			
 
				+        else:
			
 
				+            # 行内lt_text两两生成空白
			
 
				+            for lt_text1 in lt_text_row:
			
 
				+                sub_row = []
			
 
				+                for lt_text2 in lt_text_row:
			
 
				+                    if lt_text1 == lt_text2:
			
 
				+                        continue
			
 
				+                    # 必须从左到右
			
 
				+                    if lt_text1.bbox[2] > lt_text2.bbox[0]:
			
 
				+                        continue
			
 
				+                    line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
			
 
				+                    line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
			
 
				+                    if line_iou(line1, line2) > 0:
			
 
				+                        continue
			
 
				+                    sub_row.append([min(lt_text1.bbox[2], lt_text2.bbox[0]),
			
 
				+                                    min(lt_text1.bbox[3], lt_text2.bbox[1]),
			
 
				+                                    max(lt_text1.bbox[2], lt_text2.bbox[0]),
			
 
				+                                    max(lt_text1.bbox[3], lt_text2.bbox[1]),
			
 
				+                                    ])
			
 
				+                    if show:
			
 
				+                        print('sub_row', lt_text1.get_text(), lt_text2.get_text(), sub_row[-1])
			
 
				+
			
 
				+                # 每个lt_text只找出其对应的最小的空白
			
 
				+                if not sub_row:
			
 
				+                    continue
			
 
				+                sub_row.sort(key=lambda x: abs(x[0] - x[2]))
			
 
				+                if show:
			
 
				+                    print('sub_row[-1]', lt_text1.get_text(), sub_row[-1])
			
 
				+
			
 
				+                blank_row.append(sub_row[0])
			
 
				+
			
 
				+            # 判断最小距离，一行至少有一段空白大于最小距离
			
 
				+            match_flag = 0
			
 
				+            for r in blank_row:
			
 
				+                if abs(r[2] - r[0]) >= blank_min_width:
			
 
				+                    match_flag = 1
			
 
				+                    break
			
 
				+            if match_flag:
			
 
				+                blank_row_list.append(blank_row)
			
 
				+            else:
			
 
				+                blank_row_list.append([])
			
 
				+
			
 
				+    return blank_row_list
			
 
				+
			
 
				+
			
 
				+def row1_row2_has_same_blank(row1, row2):
			
 
				+    # row1的任一空白，都能和row2的任一空白相交
			
 
				+    cross_flag = 0
			
 
				+    for blank1 in row1:
			
 
				+        if cross_flag == 1:
			
 
				+            break
			
 
				+        for blank2 in row2:
			
 
				+            if blank1[0] <= blank2[0] <= blank1[2] \
			
 
				+                    or blank1[0] <= blank2[2] <= blank1[2] \
			
 
				+                    or blank2[0] <= blank1[0] <= blank2[2] \
			
 
				+                    or blank2[0] <= blank1[2] <= blank2[2]:
			
 
				+                cross_flag = 1
			
 
				+                break
			
 
				+
			
 
				+    if cross_flag:
			
 
				+        return True
			
 
				+    else:
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+@memory_decorator
			
 
				+def get_b_table_by_colon(b_table, blank_width, show=0):
			
 
				+    # print('into get_b_table_by_colon')
			
 
				+
			
 
				+    table_bbox = get_table_bbox(b_table)
			
 
				+
			
 
				+    # 有些确定为非表格，也输出，防止后续YOLO判断为表格，搞乱数据
			
 
				+    not_table_bbox_list = []
			
 
				+
			
 
				+    #
			
 
				+    # row_cnt_list = [len(x) in [2, 3, 4] for x in b_table]
			
 
				+
			
 
				+    # 所有行需是2列或4列，同一列算作一列
			
 
				+    row_cnt_list = []
			
 
				+    head_cnt_list = []
			
 
				+    for row in b_table:
			
 
				+        if not row:
			
 
				+            continue
			
 
				+        row.sort(key=lambda x: (x.bbox[0]))
			
 
				+        col_cnt = 1
			
 
				+        head_cnt = 0
			
 
				+        if re.search('[：:]', row[0].get_text()):
			
 
				+            head_cnt += 1
			
 
				+        for ci, col in enumerate(row):
			
 
				+            if ci == 0:
			
 
				+                continue
			
 
				+            col1 = row[ci - 1]
			
 
				+            col2 = row[ci]
			
 
				+            line1 = [(col1.bbox[0], 0), (col1.bbox[2], 0)]
			
 
				+            line2 = [(col2.bbox[0], 0), (col2.bbox[2], 0)]
			
 
				+            if line_iou(line1, line2) >= 0.5:
			
 
				+                continue
			
 
				+            else:
			
 
				+                col_cnt += 1
			
 
				+                if re.search('[：:]', col2.get_text()):
			
 
				+                    head_cnt += 1
			
 
				+        row_cnt_list.append(col_cnt in [2, 3, 4])
			
 
				+        head_cnt_list.append(head_cnt)
			
 
				+
			
 
				+    if show:
			
 
				+        print('row_cnt_list', row_cnt_list)
			
 
				+        print('head_cnt_list', head_cnt_list)
			
 
				+
			
 
				+    if max(head_cnt_list) > 2:
			
 
				+        if show:
			
 
				+            for row in b_table:
			
 
				+                print('head_cnt_list row', row)
			
 
				+        return [], None, not_table_bbox_list, table_bbox
			
 
				+
			
 
				+    # 最后一行年月日可能会影响列数，不是234列
			
 
				+    if row_cnt_list[-1] is False:
			
 
				+        row_cnt_list = row_cnt_list[:-1]
			
 
				+        b_table = b_table[:-1]
			
 
				+        table_bbox = get_table_bbox(b_table)
			
 
				+
			
 
				+    row_cnt_list = list(set(row_cnt_list))
			
 
				+    if not (len(row_cnt_list) == 1 and row_cnt_list[0] is True):
			
 
				+        return [], None, not_table_bbox_list, table_bbox
			
 
				+
			
 
				+    # 至少有2个以上文本包含冒号
			
 
				+    colon_cnt = 0
			
 
				+    for lt_text_row in b_table:
			
 
				+        for lt_text in lt_text_row:
			
 
				+            if re.search('[:：]', lt_text.get_text()) and re.search('[\u4e00-\u9fff]', lt_text.get_text()):
			
 
				+                colon_cnt += 1
			
 
				+    if show:
			
 
				+        print('colon_cnt, len(table)', colon_cnt, len(b_table))
			
 
				+    # if colon_cnt < 2:
			
 
				+    if colon_cnt < len(b_table) / 2:
			
 
				+        return [], None, not_table_bbox_list, table_bbox
			
 
				+
			
 
				+    blank_row_list = get_blank_row(b_table, blank_width)
			
 
				+    if show:
			
 
				+        print('b_table get_blank_row colon', b_table)
			
 
				+        print('blank_row_list colon', blank_row_list)
			
 
				+    # blank_row_list = [y for x in blank_row_list for y in x]
			
 
				+    # print('blank_row_list2', blank_row_list)
			
 
				+    # # 先选最长空白包含的所有空白
			
 
				+    # blank_row_list.sort(key=lambda x: abs(x[0]-x[2]), reverse=True)
			
 
				+    # max_blank = blank_row_list[0]
			
 
				+    # if show:
			
 
				+    #     print('max_blank', max_blank)
			
 
				+    # if abs(max_blank[0]-max_blank[2]) <= 4 * avg_char_width:
			
 
				+    #     return []
			
 
				+    # max_col = []
			
 
				+    # for blank_row_bbox in blank_row_list:
			
 
				+    #     if max_blank[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= max_blank[2]:
			
 
				+    #         max_col.append(blank_row_bbox)
			
 
				+    # if show:
			
 
				+    #     print('max_col', max_col)
			
 
				+    # if not max_col:
			
 
				+    #     return []
			
 
				+    # # 选取被包含最多的空白
			
 
				+    # blank_contain_cnt_dict = {}
			
 
				+    # for bi, blank_row_bbox in enumerate(max_col):
			
 
				+    #     blank_contain_cnt_dict[bi] = 0
			
 
				+    #     for blank_row_bbox2 in max_col:
			
 
				+    #         if blank_row_bbox2[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= blank_row_bbox2[2]:
			
 
				+    #             blank_contain_cnt_dict[bi] += 1
			
 
				+    # blank_contain_cnt_list = [[k, v] for k, v in blank_contain_cnt_dict.items()]
			
 
				+    # blank_contain_cnt_list.sort(key=lambda x: x[1])
			
 
				+    # if show:
			
 
				+    #     print('blank_contain_cnt_list', blank_contain_cnt_list)
			
 
				+    # center_blank_row = max_col[blank_contain_cnt_list[-1][0]]
			
 
				+
			
 
				+    center_blank_row = choose_center_blank(blank_row_list, blank_width)
			
 
				+    if show:
			
 
				+        print('center_blank_row', center_blank_row)
			
 
				+
			
 
				+    # 获取中心最短的空白，作为参考
			
 
				+    # blank_list = [get_blank_row(x) for x in b_table]
			
 
				+    # blank_list = [x[0] if len(x) == 1 else x[1] for x in blank_list]
			
 
				+    # blank_list.sort(key=lambda x: abs(x[2] - x[0]))
			
 
				+    # center_blank = blank_list[0]
			
 
				+    #
			
 
				+    # print('center_blank', center_blank)
			
 
				+
			
 
				+    # 根据中心空白，分为两列
			
 
				+    # col_list1 = []
			
 
				+    # col_list2 = []
			
 
				+    # col_box_dict = {}
			
 
				+    # for lt_text_row in b_table:
			
 
				+    #     lt_text_row.sort(key=lambda x: x.bbox[0])
			
 
				+    #     # if len(lt_text_row) == 4:
			
 
				+    #     #     text1 = lt_text_row[0].get_text() + lt_text_row[1].get_text()
			
 
				+    #     #     text2 = lt_text_row[2].get_text() + lt_text_row[3].get_text()
			
 
				+    #     #     box1 = [
			
 
				+    #     #         min(lt_text_row[0].bbox[0], lt_text_row[1].bbox[0]),
			
 
				+    #     #         max(lt_text_row[0].bbox[2], lt_text_row[1].bbox[2]),
			
 
				+    #     #         min(lt_text_row[0].bbox[1], lt_text_row[1].bbox[1]),
			
 
				+    #     #         max(lt_text_row[0].bbox[3], lt_text_row[1].bbox[3])
			
 
				+    #     #     ]
			
 
				+    #     #     box2 = [
			
 
				+    #     #         min(lt_text_row[2].bbox[0], lt_text_row[3].bbox[0]),
			
 
				+    #     #         max(lt_text_row[2].bbox[2], lt_text_row[3].bbox[2]),
			
 
				+    #     #         min(lt_text_row[2].bbox[1], lt_text_row[3].bbox[1]),
			
 
				+    #     #         max(lt_text_row[2].bbox[3], lt_text_row[3].bbox[3])
			
 
				+    #     #     ]
			
 
				+    #     #
			
 
				+    #     #     # col_list1.append(text1)
			
 
				+    #     #     # col_list2.append(text2)
			
 
				+    #     # else:
			
 
				+    #     #     text1 = lt_text_row[0].get_text()
			
 
				+    #     #     text2 = lt_text_row[1].get_text()
			
 
				+    #     #     box1 = lt_text_row[0].bbox
			
 
				+    #     #     box2 = lt_text_row[1].bbox
			
 
				+    #
			
 
				+    #     left_col = []
			
 
				+    #     right_col = []
			
 
				+    #     for lt_text in lt_text_row:
			
 
				+    #         if lt_text.bbox[2] <= center_blank_row[0]:
			
 
				+    #             left_col.append(lt_text)
			
 
				+    #         else:
			
 
				+    #             right_col.append(lt_text)
			
 
				+    #
			
 
				+    #     left_text = [x.get_text() for x in left_col]
			
 
				+    #     left_text = ''.join(left_text)
			
 
				+    #     right_text = [x.get_text() for x in right_col]
			
 
				+    #     right_text = ''.join(right_text)
			
 
				+    #
			
 
				+    #     text1 = left_text.strip()
			
 
				+    #     text2 = right_text.strip()
			
 
				+    #
			
 
				+    #     # if text1 in col_box_dict.keys():
			
 
				+    #     #     col_box_dict[text1] += [box1]
			
 
				+    #     # else:
			
 
				+    #     #     col_box_dict[text1] = [box1]
			
 
				+    #     # if text2 in col_box_dict.keys():
			
 
				+    #     #     col_box_dict[text2] += [box2]
			
 
				+    #     # else:
			
 
				+    #     #     col_box_dict[text2] = [box2]
			
 
				+    #
			
 
				+    #     col_list1.append(text1)
			
 
				+    #     col_list2.append(text2)
			
 
				+    #
			
 
				+    # if show:
			
 
				+    #     print('col_list1', col_list1)
			
 
				+    #     print('col_list2', col_list2)
			
 
				+
			
 
				+    # col_key_value_list1 = []
			
 
				+    # last_key = ""
			
 
				+    # for col1 in col_list1:
			
 
				+    #     match = re.search('[:：]+', col1)
			
 
				+    #     # 有冒号的
			
 
				+    #     if match:
			
 
				+    #         key = col1[:match.end()]
			
 
				+    #         if last_key:
			
 
				+    #             key = last_key + key
			
 
				+    #             last_key = ""
			
 
				+    #         value = col1[match.end():]
			
 
				+    #         col_key_value_list1.append([key, value])
			
 
				+    #     # 没有冒号的
			
 
				+    #     else:
			
 
				+    #         # 如果该值也存在在col_list2里，则看做表头，和下一行的表头连在一起
			
 
				+    #         if col1 in col_list2:
			
 
				+    #             if show:
			
 
				+    #                 print('col1 in col_list2')
			
 
				+    #             last_key = col1
			
 
				+    #         # 不存在，则是上一行的值，和上一行的值连在一起
			
 
				+    #         else:
			
 
				+    #             if col_key_value_list1 and re.search('[:：]', col_key_value_list1[-1][1]):
			
 
				+    #                 col_key_value_list1[-1][1] += col1
			
 
				+    #             else:
			
 
				+    #                 col_key_value_list1.append(["", col1])
			
 
				+    #
			
 
				+    # if show:
			
 
				+    #     print('col_key_value_list1', col_key_value_list1)
			
 
				+    #
			
 
				+    # col_key_value_list2 = []
			
 
				+    # last_key = ""
			
 
				+    # for col2 in col_list2:
			
 
				+    #     match = re.search('[:：]+', col2)
			
 
				+    #     if match:
			
 
				+    #         key = col2[:match.end()]
			
 
				+    #         if last_key:
			
 
				+    #             key = last_key + key
			
 
				+    #             last_key = ""
			
 
				+    #         value = col2[match.end():]
			
 
				+    #         col_key_value_list2.append([key, value])
			
 
				+    #     else:
			
 
				+    #         # 如果该值也存在在col_list1里，则看做表头，和下一行的表头连在一起
			
 
				+    #         if col2 in col_list1:
			
 
				+    #             if show:
			
 
				+    #                 print('col2 in col_list1')
			
 
				+    #             last_key = col2
			
 
				+    #         # 不存在，则是上一行的值，和上一行的值连在一起
			
 
				+    #         else:
			
 
				+    #             if col_key_value_list2 and re.search('[:：]', col_key_value_list2[-1][1]):
			
 
				+    #                 col_key_value_list2[-1][1] += col2
			
 
				+    #             else:
			
 
				+    #                 col_key_value_list2.append(["", col2])
			
 
				+    #
			
 
				+    # if show:
			
 
				+    #     print('col_key_value_list2', col_key_value_list2)
			
 
				+
			
 
				+    if not center_blank_row:
			
 
				+        return [], None, not_table_bbox_list, table_bbox
			
 
				+
			
 
				+    # 根据中心空白，分为两列
			
 
				+    col_list1, col_list2 = divide_2_col_by_center_blank(b_table, center_blank_row)
			
 
				+    # 非表格，一般是那种一行里键值离的较远的单列，加入非表格，后续yolo判断也忽略
			
 
				+    if not col_list1 and not col_list2:
			
 
				+        not_table_bbox = get_table_bbox(b_table)
			
 
				+        not_table_bbox_list.append(not_table_bbox)
			
 
				+        return [], None, not_table_bbox_list, table_bbox
			
 
				+
			
 
				+    # 两列中，分别设置head value
			
 
				+    col_key_value_list1 = set_head_value_in_col(col_list1, col_list2)
			
 
				+    col_key_value_list2 = set_head_value_in_col(col_list2, col_list1)
			
 
				+
			
 
				+    # 根据两列head value，形成行
			
 
				+    b_table_row_list = []
			
 
				+    for i in range(max(len(col_key_value_list1), len(col_key_value_list2))):
			
 
				+        if i >= len(col_key_value_list1):
			
 
				+            col1 = ["", ""]
			
 
				+        else:
			
 
				+            col1 = col_key_value_list1[i]
			
 
				+        if i >= len(col_key_value_list2):
			
 
				+            col2 = ["", ""]
			
 
				+        else:
			
 
				+            col2 = col_key_value_list2[i]
			
 
				+
			
 
				+        row = col1[:2] + col2[:2]
			
 
				+        b_table_row_list.append(row)
			
 
				+
			
 
				+    # 删除空白列
			
 
				+    # col_dict = {}
			
 
				+    # for row in b_table_row_list:
			
 
				+    #     for col_i, col in enumerate(row):
			
 
				+    #         if col_i in col_dict.keys():
			
 
				+    #             col_dict[col_i] += [col]
			
 
				+    #         else:
			
 
				+    #             col_dict[col_i] = [col]
			
 
				+    # delete_col_i = []
			
 
				+    # for col_i, cols in col_dict.items():
			
 
				+    #     cols = list(set(cols))
			
 
				+    #     if len(cols) == 1 and cols[0] == '':
			
 
				+    #         delete_col_i.append(col_i)
			
 
				+    #
			
 
				+    # temp_list = []
			
 
				+    # for row in b_table_row_list:
			
 
				+    #     new_col = []
			
 
				+    #     for col_i, col in enumerate(row):
			
 
				+    #         if col_i in delete_col_i:
			
 
				+    #             continue
			
 
				+    #         new_col.append(col)
			
 
				+    #     temp_list.append(new_col)
			
 
				+    # b_table_row_list = temp_list
			
 
				+
			
 
				+    # 去掉删除空白列
			
 
				+    # b_table_row_list = delete_blank_col(b_table_row_list)
			
 
				+
			
 
				+    # 修复因表头和值是同一列上下排列，导致的错位
			
 
				+    b_table_row_list = fix_head_value_match(b_table_row_list)
			
 
				+
			
 
				+    if show:
			
 
				+        print('b_table_row_list', b_table_row_list)
			
 
				+    return b_table_row_list, center_blank_row, not_table_bbox_list, table_bbox
			
 
				+
			
 
				+
			
 
				+@memory_decorator
			
 
				+def get_text_row_by_blank(lt_text_list, layout_h, show=0):
			
 
				+    if show:
			
 
				+        for lt_text_row in lt_text_list:
			
 
				+            print('lt_text_111', lt_text_row)
			
 
				+    lt_text_blank_list = get_up_down_blank(lt_text_list)
			
 
				+    lt_text_row_list = get_contain_blank_row(lt_text_blank_list, layout_h)
			
 
				+    if show:
			
 
				+        for lt_text_row in lt_text_row_list:
			
 
				+            print('lt_text_row', lt_text_row)
			
 
				+
			
 
				+    return lt_text_row_list
			
 
				+
			
 
				+
			
 
				+def get_text_row_by_center_blank(b_table, lt_text_list, blank_width, layout_h, show=0):
			
 
				+    # 获取行空白
			
 
				+    blank_row_list = get_blank_row(b_table, blank_width)
			
 
				+    if show:
			
 
				+        print('b_table get_blank_row center_blank', b_table)
			
 
				+        print('blank_row_list center_blank', blank_row_list)
			
 
				+
			
 
				+    # 获取中心空白
			
 
				+    center_blank_row = choose_center_blank(blank_row_list, blank_width)
			
 
				+    if show:
			
 
				+        print('center_blank_row center', center_blank_row)
			
 
				+    if not center_blank_row:
			
 
				+        return [], []
			
 
				+
			
 
				+    center_x = (center_blank_row[2] + center_blank_row[0]) / 2
			
 
				+
			
 
				+    lt_text_blank_list = get_up_down_blank(lt_text_list, center_x=center_x)
			
 
				+
			
 
				+    lt_text_row_list = get_contain_blank_row(lt_text_blank_list, layout_h)
			
 
				+
			
 
				+    if show:
			
 
				+        for lt_text_row in lt_text_row_list:
			
 
				+            print('lt_text_row center', lt_text_row)
			
 
				+
			
 
				+    return lt_text_row_list, center_blank_row
			
 
				+
			
 
				+
			
 
				+def table_list_to_dict(table):
			
 
				+    table_dict_list = []
			
 
				+    for row in table:
			
 
				+        new_row = []
			
 
				+        for col in row:
			
 
				+            col_dict = {
			
 
				+                'rowspan': 1,
			
 
				+                'columnspan': 1,
			
 
				+                'text': col
			
 
				+            }
			
 
				+            new_row.append(col_dict)
			
 
				+        table_dict_list.append(new_row)
			
 
				+    return table_dict_list
			
 
				+
			
 
				+
			
 
				+@memory_decorator
			
 
				+def get_up_down_blank(lt_text_list, center_x=None, show=0):
			
 
				+    # 根据文本上下的空白分行
			
 
				+    lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
			
 
				+    lt_text_blank_list = []
			
 
				+    for i in range(len(lt_text_list)):
			
 
				+        lt_text1 = lt_text_list[i]
			
 
				+        line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
			
 
				+        if center_x is not None:
			
 
				+            left_or_right1 = 0 if (lt_text1.bbox[0] + lt_text1.bbox[2]) / 2 <= center_x else 1
			
 
				+
			
 
				+        up_blank_list = []
			
 
				+        down_blank_list = []
			
 
				+        for j in range(len(lt_text_list)):
			
 
				+            lt_text2 = lt_text_list[j]
			
 
				+            if lt_text1 == lt_text2:
			
 
				+                continue
			
 
				+
			
 
				+            # 没有中间列分割
			
 
				+            if center_x is None:
			
 
				+                line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
			
 
				+                iou = line_iou(line1, line2)
			
 
				+                if lt_text2.bbox[1] > lt_text1.bbox[3] and iou > 0:
			
 
				+                    down_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
			
 
				+                if lt_text2.bbox[3] < lt_text1.bbox[1] and iou > 0:
			
 
				+                    up_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
			
 
				+                # if lt_text1.bbox[1] > lt_text2.bbox[3] and iou > 0:
			
 
				+                #     down_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
			
 
				+                # if lt_text1.bbox[3] < lt_text2.bbox[1] and iou > 0:
			
 
				+                #     up_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
			
 
				+            # 有中间列分割
			
 
				+            else:
			
 
				+                left_or_right2 = 0 if (lt_text2.bbox[0] + lt_text2.bbox[2]) / 2 <= center_x else 1
			
 
				+                if lt_text2.bbox[1] > lt_text1.bbox[3] and left_or_right1 == left_or_right2:
			
 
				+                    down_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
			
 
				+                if lt_text2.bbox[3] < lt_text1.bbox[1] and left_or_right1 == left_or_right2:
			
 
				+                    up_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
			
 
				+                # if lt_text1.bbox[1] > lt_text2.bbox[3] and left_or_right1 == left_or_right2:
			
 
				+                #     down_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
			
 
				+                # if lt_text1.bbox[3] < lt_text2.bbox[1] and left_or_right1 == left_or_right2:
			
 
				+                #     up_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
			
 
				+
			
 
				+        # 找不到的，空白设置为自身text高度
			
 
				+        text_h = abs(lt_text1.bbox[3] - lt_text1.bbox[1])
			
 
				+        if not up_blank_list:
			
 
				+            up_blank_list.append([max(0, lt_text1.bbox[1] - text_h), lt_text1.bbox[1]])
			
 
				+        if not down_blank_list:
			
 
				+            down_blank_list.append([lt_text1.bbox[3], lt_text1.bbox[3] + text_h])
			
 
				+
			
 
				+        down_blank = down_blank_list[0]
			
 
				+        up_blank = up_blank_list[-1]
			
 
				+
			
 
				+        if show:
			
 
				+            print('lt_text1.get_text()', lt_text1.get_text(), lt_text1.bbox)
			
 
				+            if center_x is not None:
			
 
				+                print('center_x', center_x)
			
 
				+            print('up_blank', up_blank)
			
 
				+            print('down_blank', down_blank)
			
 
				+
			
 
				+        lt_text_blank_list.append([lt_text1, up_blank, down_blank])
			
 
				+    return lt_text_blank_list
			
 
				+
			
 
				+
			
 
				+@memory_decorator
			
 
				+def filter_large_blank_row(lt_text_blank_list, layout_h, show=0):
			
 
				+    # 先过滤空白过大的，单独成行
			
 
				+    lt_text_row_list = []
			
 
				+    single_lt_text_list = []
			
 
				+    max_blank_h = layout_h / 6
			
 
				+    index = 0
			
 
				+    threshold = 20
			
 
				+    lt_text_blank_list.sort(key=lambda x: (x[0].bbox[1], x[0].bbox[0]))
			
 
				+    for lt_text1, up_blank1, down_blank1 in lt_text_blank_list:
			
 
				+        row = []
			
 
				+        # 空白高度大于一定值，单独一行
			
 
				+        match_flag = 0
			
 
				+        # 在最下方的lt_text，判断上空白
			
 
				+        if index >= len(lt_text_blank_list) - 4 \
			
 
				+                and abs(up_blank1[0] - up_blank1[1]) >= max_blank_h:
			
 
				+            if show:
			
 
				+                print('match single lt_text 1')
			
 
				+            match_flag = 1
			
 
				+        # 在最上方的lt_text，判断下空白
			
 
				+        elif index <= 2 \
			
 
				+                and abs(down_blank1[0] - down_blank1[1]) >= max_blank_h:
			
 
				+            if show:
			
 
				+                print('match single lt_text 2')
			
 
				+            match_flag = 1
			
 
				+        # 在中间的，上下一起判断
			
 
				+        elif 2 <= index <= len(lt_text_blank_list) - 4 \
			
 
				+                and abs(up_blank1[0] - down_blank1[1]) >= max_blank_h:
			
 
				+            # 判断没有同行的
			
 
				+            has_same_row_flag = 0
			
 
				+            for lt_text2, _, _ in lt_text_blank_list:
			
 
				+                if lt_text1 == lt_text2:
			
 
				+                    continue
			
 
				+                if lt_text1.bbox[1] - threshold <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= lt_text1.bbox[3] + threshold:
			
 
				+                    has_same_row_flag = 1
			
 
				+                    break
			
 
				+            if has_same_row_flag:
			
 
				+                match_flag = 0
			
 
				+            else:
			
 
				+                match_flag = 1
			
 
				+            if show:
			
 
				+                print('match single lt_text 3')
			
 
				+
			
 
				+        if match_flag:
			
 
				+            row.append(lt_text1)
			
 
				+            lt_text_row_list.append(row)
			
 
				+            single_lt_text_list.append(lt_text1)
			
 
				+        index += 1
			
 
				+
			
 
				+    if show:
			
 
				+        print('single_lt_text_list', single_lt_text_list)
			
 
				+    return lt_text_row_list, single_lt_text_list
			
 
				+
			
 
				+
			
 
				+@memory_decorator
			
 
				+def get_contain_blank_row(lt_text_blank_list, layout_h, show=0):
			
 
				+    from format_convert.convert_tree import TextBox
			
 
				+    lt_text_row_list, single_lt_text_list = filter_large_blank_row(lt_text_blank_list, layout_h)
			
 
				+    single_lt_text_list = set(single_lt_text_list)
			
 
				+
			
 
				+    # 空白互相包含的就是同一行
			
 
				+    time1 = time.time()
			
 
				+    threshold = 5
			
 
				+    used_lt_text_list = set([])
			
 
				+    another_used_lt_text_list = set([])
			
 
				+    for i1 in range(len(lt_text_blank_list)):
			
 
				+        time2 = time.time()
			
 
				+        lt_text1, up_blank1, down_blank1 = lt_text_blank_list[i1]
			
 
				+        row = []
			
 
				+        if lt_text1 in single_lt_text_list:
			
 
				+            continue
			
 
				+        for i2 in range(len(lt_text_blank_list)):
			
 
				+            lt_text2, up_blank2, down_blank2 = lt_text_blank_list[i2]
			
 
				+            if lt_text1 == lt_text2:
			
 
				+                continue
			
 
				+            if lt_text2 in another_used_lt_text_list:
			
 
				+                continue
			
 
				+            if lt_text2 in used_lt_text_list and lt_text1.bbox[1] >= lt_text2.bbox[3]:
			
 
				+                continue
			
 
				+            if lt_text2 in single_lt_text_list:
			
 
				+                continue
			
 
				+
			
 
				+            # 单独上空白包含上空白，下空白包含下空白
			
 
				+            if (up_blank1[0] - threshold <= up_blank2[0] <= up_blank2[1] <= up_blank1[1] + threshold) \
			
 
				+                    or (down_blank1[0] - threshold <= down_blank2[0] <= down_blank2[1] <= down_blank1[1] + threshold):
			
 
				+                    # or (up_blank2[0] - threshold <= up_blank1[0] <= up_blank1[1] <= up_blank2[1] + threshold) \
			
 
				+                    # or (down_blank2[0] - threshold <= down_blank1[0] <= down_blank1[1] <= down_blank2[1] + threshold):
			
 
				+                if lt_text2 not in row:
			
 
				+                    row.append(lt_text2)
			
 
				+                    used_lt_text_list.add(lt_text2)
			
 
				+
			
 
				+            # 若是上下空白包含了另一个的文本部分，也成立
			
 
				+            # if up_blank1[0] <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= down_blank1[1]:
			
 
				+            #     if lt_text2 not in row:
			
 
				+            #         row.append(lt_text2)
			
 
				+            #         used_lt_text_list.append(lt_text2)
			
 
				+
			
 
				+
			
 
				+
			
 
				+        if lt_text1 not in row:
			
 
				+            row.append(lt_text1)
			
 
				+
			
 
				+        if show:
			
 
				+            print('get_contain_blank_row loop2 cost:', time.time()-time2)
			
 
				+
			
 
				+        # 若一个row中有3个带冒号的，说明误把一个单独行合进来了，分开
			
 
				+        time2 = time.time()
			
 
				+        colon_cnt = 0
			
 
				+        colon_lt_text = []
			
 
				+        for lt in row:
			
 
				+            if re.search('[:：]', lt.get_text()):
			
 
				+                colon_cnt += 1
			
 
				+                colon_lt_text.append(lt)
			
 
				+        if colon_cnt >= 3:
			
 
				+            if show:
			
 
				+                print('colon_cnt >= 3 row', row)
			
 
				+
			
 
				+            another_lt_text_list = find_outline_lt_text(row)
			
 
				+
			
 
				+            # # 把y最大的lt_text单独放一行
			
 
				+            # colon_lt_text.sort(key=lambda x: x.bbox[1])
			
 
				+            # # 除了前两个，其他都单放一行
			
 
				+            # another_lt_text_list = colon_lt_text[2:]
			
 
				+            for lt_text in another_lt_text_list:
			
 
				+                if lt_text in row:
			
 
				+                    row.remove(lt_text)
			
 
				+                if lt_text in colon_lt_text:
			
 
				+                    colon_lt_text.remove(lt_text)
			
 
				+
			
 
				+            if show:
			
 
				+                print('another_lt_text_list', another_lt_text_list)
			
 
				+                print('colon_lt_text', colon_lt_text)
			
 
				+
			
 
				+            if not colon_lt_text:
			
 
				+                continue
			
 
				+
			
 
				+            colon_lt_text.sort(key=lambda x: x.bbox[0])
			
 
				+            lt_text_row_list.append(row)
			
 
				+            for another_lt_text in another_lt_text_list:
			
 
				+                if abs(another_lt_text.bbox[0] - colon_lt_text[0].bbox[0]) > abs(
			
 
				+                        another_lt_text.bbox[0] - colon_lt_text[-1].bbox[0]):
			
 
				+                    new_bbox = [colon_lt_text[0].bbox[0], another_lt_text.bbox[1],
			
 
				+                                colon_lt_text[0].bbox[2], another_lt_text.bbox[3]]
			
 
				+                    another_row = [TextBox(text="@@:", bbox=new_bbox), another_lt_text]
			
 
				+                else:
			
 
				+                    new_bbox = [colon_lt_text[-1].bbox[0], another_lt_text.bbox[1],
			
 
				+                                colon_lt_text[-1].bbox[2], another_lt_text.bbox[3]]
			
 
				+                    # 新增一列占位
			
 
				+                    another_row = [another_lt_text, TextBox(text="@@:", bbox=new_bbox)]
			
 
				+                if show:
			
 
				+                    print('another_row', another_row)
			
 
				+                for lt_text3 in another_row:
			
 
				+                    another_used_lt_text_list.add(lt_text3)
			
 
				+                lt_text_row_list.append(another_row)
			
 
				+        else:
			
 
				+            lt_text_row_list.append(row)
			
 
				+
			
 
				+        if show:
			
 
				+            print('get_contain_blank_row judge colon cost:', time.time()-time2)
			
 
				+
			
 
				+    if show:
			
 
				+        print('get_contain_blank_row double loop cost: ', time.time()-time1)
			
 
				+
			
 
				+    # 去重
			
 
				+    lt_text_row_list.sort(key=lambda x: len(x), reverse=True)
			
 
				+    if show:
			
 
				+        for lt_text_row in lt_text_row_list:
			
 
				+            print('before dedup lt_text_row', lt_text_row)
			
 
				+
			
 
				+    lt_text_row_list = merge_intersecting_lists(lt_text_row_list)
			
 
				+
			
 
				+    if show:
			
 
				+        for lt_text_row in lt_text_row_list:
			
 
				+            print('after dedup lt_text_row', lt_text_row)
			
 
				+
			
 
				+    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
			
 
				+
			
 
				+    # 剔除全是空白的行
			
 
				+    temp_list = []
			
 
				+    for lt_text_row in lt_text_row_list:
			
 
				+        row_text = ""
			
 
				+        for lt_text in lt_text_row:
			
 
				+            row_text += lt_text.get_text()
			
 
				+        if re.sub('\s+', '', row_text) == "":
			
 
				+            continue
			
 
				+        temp_list.append(lt_text_row)
			
 
				+    lt_text_row_list = temp_list
			
 
				+    return lt_text_row_list
			
 
				+
			
 
				+
			
 
				+def choose_center_blank(blank_row_list, blank_width, show=0):
			
 
				+    if not blank_row_list:
			
 
				+        return []
			
 
				+
			
 
				+    # 先选最长空白包含的所有空白
			
 
				+    blank_list = [y for x in blank_row_list for y in x]
			
 
				+    if not blank_list:
			
 
				+        return []
			
 
				+
			
 
				+    blank_list.sort(key=lambda x: abs(x[0] - x[2]), reverse=True)
			
 
				+    max_blank = blank_list[0]
			
 
				+    if show:
			
 
				+        print('max_blank', max_blank)
			
 
				+    if abs(max_blank[0] - max_blank[2]) <= blank_width:
			
 
				+        return []
			
 
				+
			
 
				+    max_col = []
			
 
				+    for blank_row in blank_row_list:
			
 
				+        if not blank_row:
			
 
				+            continue
			
 
				+
			
 
				+        # # 找出每一行最大的空白列，但是同一列中则选列中最小的空白
			
 
				+        # # 空白分列
			
 
				+        # blank_row.sort(key=lambda x: (x[0], x[1]))
			
 
				+        # last_blank_bbox = blank_row[0]
			
 
				+        # blank_col = []
			
 
				+        # blank_col_list = []
			
 
				+        # for blank_bbox in blank_row[1:]:
			
 
				+        #     line1 = ([blank_bbox[0], 0], [blank_bbox[2], 0])
			
 
				+        #     line2 = ([last_blank_bbox[0], 0], [last_blank_bbox[2], 0])
			
 
				+        #     if line_iou(line1, line2) >= 0.7:
			
 
				+        #         blank_col += [blank_bbox, last_blank_bbox]
			
 
				+        #     else:
			
 
				+        #         blank_col.sort(key=lambda x: abs(x[2] - x[0]))
			
 
				+        #         blank_col_list.append(blank_col)
			
 
				+        #         blank_col = []
			
 
				+        #     last_blank_bbox = blank_bbox
			
 
				+
			
 
				+        # 选最大的列
			
 
				+        max_blank_bbox = blank_row[0]
			
 
				+        for blank_bbox in blank_row[1:]:
			
 
				+            if abs(blank_bbox[0] - blank_bbox[2]) > abs(max_blank_bbox[0] - max_blank_bbox[2]):
			
 
				+                max_blank_bbox = blank_bbox
			
 
				+
			
 
				+        if show:
			
 
				+            print('max_blank_bbox, blank_row', max_blank_bbox, blank_row)
			
 
				+
			
 
				+        line1 = ([max_blank[0], 0], [max_blank[2], 0])
			
 
				+        line2 = ([max_blank_bbox[0], 0], [max_blank_bbox[2], 0])
			
 
				+        iou = line_iou(line1, line2)
			
 
				+        # if max_blank[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= max_blank[2]:
			
 
				+        if iou >= 0.5:
			
 
				+            max_col.append(max_blank_bbox)
			
 
				+    if show:
			
 
				+        print('max_col', max_col)
			
 
				+    if not max_col:
			
 
				+        return []
			
 
				+
			
 
				+    # # 选取被包含最多的空白
			
 
				+    # # 选取交集最多的空白，相同数量则最短
			
 
				+    # blank_contain_cnt_dict = {}
			
 
				+    # for bi, blank_row_bbox in enumerate(max_col):
			
 
				+    #     blank_contain_cnt_dict[bi] = 0
			
 
				+    #     for blank_row_bbox2 in max_col:
			
 
				+    #         line1 = ([blank_row_bbox2[0], 0], [blank_row_bbox2[2], 0])
			
 
				+    #         line2 = ([blank_row_bbox[0], 0], [blank_row_bbox[2], 0])
			
 
				+    #         iou = line_iou(line1, line2)
			
 
				+    #         # if blank_row_bbox2[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= blank_row_bbox2[2]:
			
 
				+    #         if iou >= 0.2:
			
 
				+    #             blank_contain_cnt_dict[bi] += 1
			
 
				+    # blank_contain_cnt_list = [[k, v, abs(max_col[k][2] - max_col[k][0])/2] for k, v in blank_contain_cnt_dict.items()]
			
 
				+    # blank_contain_cnt_list.sort(key=lambda x: (x[1], -x[2]))
			
 
				+    # if show:
			
 
				+    #     print('blank_contain_cnt_list', blank_contain_cnt_list)
			
 
				+    # center_blank_row = max_col[blank_contain_cnt_list[-1][0]]
			
 
				+
			
 
				+    # 选取交集部分
			
 
				+    center_blank_row = get_inter_part(max_col)
			
 
				+    return center_blank_row
			
 
				+
			
 
				+
			
 
				+def set_head_value_in_col(col_list1, col_list2, show=0):
			
 
				+    # 在列中设置 表头和值
			
 
				+    col_key_value_list = []
			
 
				+    last_key = ""
			
 
				+    for col1 in col_list1:
			
 
				+        match = re.search('[:：]+', col1)
			
 
				+        # 有冒号的
			
 
				+        if match:
			
 
				+            key = col1[:match.end()]
			
 
				+            if last_key:
			
 
				+                key = last_key + key
			
 
				+                last_key = ""
			
 
				+            value = col1[match.end():]
			
 
				+            col_key_value_list.append([key, value])
			
 
				+        # 没有冒号的
			
 
				+        else:
			
 
				+            # 如果该值也存在在col_list2里，则看做表头，和下一行的表头连在一起
			
 
				+            if col1 in col_list2:
			
 
				+                if show:
			
 
				+                    print('col1 in col_list2')
			
 
				+                # 若上一行也是无冒号的，直接加入一行
			
 
				+                if last_key:
			
 
				+                    col_key_value_list.append(["", last_key])
			
 
				+                    last_key = ''
			
 
				+                last_key = col1
			
 
				+            # 不存在，则是上一行的值，和上一行的值连在一起
			
 
				+            else:
			
 
				+                if col_key_value_list and re.search('[:：]', col_key_value_list[-1][1]):
			
 
				+                    col_key_value_list[-1][1] += col1
			
 
				+                else:
			
 
				+                    col_key_value_list.append(["", col1])
			
 
				+
			
 
				+    # 如果是最后一行没有冒号的，col1 col2都有的，直接当做一行
			
 
				+    if last_key:
			
 
				+        col_key_value_list.append(["", last_key])
			
 
				+
			
 
				+    if show:
			
 
				+        print('col_key_value_list', col_key_value_list)
			
 
				+
			
 
				+    return col_key_value_list
			
 
				+
			
 
				+
			
 
				+def divide_2_col_by_center_blank(b_table, center_blank_row, show=0):
			
 
				+    # 根据中心空白，分为两列
			
 
				+    col_list1 = []
			
 
				+    col_list2 = []
			
 
				+    col_box_dict = {}
			
 
				+    for lt_text_row in b_table:
			
 
				+        lt_text_row.sort(key=lambda x: x.bbox[0])
			
 
				+        # if len(lt_text_row) == 4:
			
 
				+        #     text1 = lt_text_row[0].get_text() + lt_text_row[1].get_text()
			
 
				+        #     text2 = lt_text_row[2].get_text() + lt_text_row[3].get_text()
			
 
				+        #     box1 = [
			
 
				+        #         min(lt_text_row[0].bbox[0], lt_text_row[1].bbox[0]),
			
 
				+        #         max(lt_text_row[0].bbox[2], lt_text_row[1].bbox[2]),
			
 
				+        #         min(lt_text_row[0].bbox[1], lt_text_row[1].bbox[1]),
			
 
				+        #         max(lt_text_row[0].bbox[3], lt_text_row[1].bbox[3])
			
 
				+        #     ]
			
 
				+        #     box2 = [
			
 
				+        #         min(lt_text_row[2].bbox[0], lt_text_row[3].bbox[0]),
			
 
				+        #         max(lt_text_row[2].bbox[2], lt_text_row[3].bbox[2]),
			
 
				+        #         min(lt_text_row[2].bbox[1], lt_text_row[3].bbox[1]),
			
 
				+        #         max(lt_text_row[2].bbox[3], lt_text_row[3].bbox[3])
			
 
				+        #     ]
			
 
				+        #
			
 
				+        #     # col_list1.append(text1)
			
 
				+        #     # col_list2.append(text2)
			
 
				+        # else:
			
 
				+        #     text1 = lt_text_row[0].get_text()
			
 
				+        #     text2 = lt_text_row[1].get_text()
			
 
				+        #     box1 = lt_text_row[0].bbox
			
 
				+        #     box2 = lt_text_row[1].bbox
			
 
				+
			
 
				+        left_col = []
			
 
				+        right_col = []
			
 
				+        for lt_text in lt_text_row:
			
 
				+            if (lt_text.bbox[2] + lt_text.bbox[0]) / 2 <= abs(center_blank_row[0] + center_blank_row[2]) / 2:
			
 
				+                left_col.append(lt_text)
			
 
				+            else:
			
 
				+                right_col.append(lt_text)
			
 
				+
			
 
				+        # 按阅读顺序排序
			
 
				+        left_col = sort_by_read_order(left_col)
			
 
				+        left_text = [x.get_text() for x in left_col]
			
 
				+        left_text = ''.join(left_text)
			
 
				+        right_col = sort_by_read_order(right_col)
			
 
				+        right_text = [x.get_text() for x in right_col]
			
 
				+        right_text = ''.join(right_text)
			
 
				+
			
 
				+        text1 = left_text.strip()
			
 
				+        text2 = right_text.strip()
			
 
				+
			
 
				+        col_list1.append(text1)
			
 
				+        col_list2.append(text2)
			
 
				+
			
 
				+    if show:
			
 
				+        print('col_list1', col_list1)
			
 
				+        print('col_list2', col_list2)
			
 
				+
			
 
				+    # 两列都必须有冒号，否则就是非2列表格
			
 
				+    colon_cnt1 = 0
			
 
				+    colon_cnt2 = 0
			
 
				+    for col in col_list1:
			
 
				+        if re.search('[：:]', col):
			
 
				+            colon_cnt1 += 1
			
 
				+    for col in col_list2:
			
 
				+        if re.search('[：:]', col):
			
 
				+            colon_cnt2 += 1
			
 
				+
			
 
				+    if colon_cnt1 < len(col_list1) / 3 or colon_cnt2 < len(col_list2) / 3:
			
 
				+        col_list1 = []
			
 
				+        col_list2 = []
			
 
				+        if show:
			
 
				+            print('col_list1 colon_cnt1 less', colon_cnt1)
			
 
				+            print('col_list2 colon_cnt2 less', colon_cnt2)
			
 
				+
			
 
				+    return col_list1, col_list2
			
 
				+
			
 
				+
			
 
				+def delete_blank_col(b_table_row_list):
			
 
				+    # 删除空白列
			
 
				+    col_dict = {}
			
 
				+    for row in b_table_row_list:
			
 
				+        for col_i, col in enumerate(row):
			
 
				+            if col_i in col_dict.keys():
			
 
				+                col_dict[col_i] += [col]
			
 
				+            else:
			
 
				+                col_dict[col_i] = [col]
			
 
				+    delete_col_i = []
			
 
				+    for col_i, cols in col_dict.items():
			
 
				+        cols = list(set(cols))
			
 
				+        if len(cols) == 1 and cols[0] == '':
			
 
				+            delete_col_i.append(col_i)
			
 
				+
			
 
				+    temp_list = []
			
 
				+    for row in b_table_row_list:
			
 
				+        new_col = []
			
 
				+        for col_i, col in enumerate(row):
			
 
				+            if col_i in delete_col_i:
			
 
				+                continue
			
 
				+            new_col.append(col)
			
 
				+        temp_list.append(new_col)
			
 
				+    b_table_row_list = temp_list
			
 
				+    return b_table_row_list
			
 
				+
			
 
				+
			
 
				+def fix_head_value_match(b_table, show=0):
			
 
				+    if not b_table:
			
 
				+        return b_table
			
 
				+    if len(b_table[0]) != 4:
			
 
				+        return b_table
			
 
				+    maybe_head_index = None
			
 
				+    match_head_value_dict = {}
			
 
				+    # 修复值跨行
			
 
				+    for row_i, row in enumerate(b_table):
			
 
				+        if maybe_head_index is None:
			
 
				+            if row[1] in ["", '@@:'] and row[3] in ["", '@@:']:
			
 
				+                match1 = re.search("[:：]", row[0])
			
 
				+                match2 = re.search("[:：]", row[2])
			
 
				+                if match1 and match2:
			
 
				+                    maybe_head_index = row_i
			
 
				+        else:
			
 
				+            if row[0] in ["", '@@:'] and row[2] in ["", '@@:'] and row[1] not in ["", '@@:'] and row[3] not in ["", '@@:']:
			
 
				+                if maybe_head_index in match_head_value_dict.keys():
			
 
				+                    match_head_value_dict[maybe_head_index] += [row_i]
			
 
				+                else:
			
 
				+                    match_head_value_dict[maybe_head_index] = [row_i]
			
 
				+            else:
			
 
				+                maybe_head_index = None
			
 
				+
			
 
				+    if show:
			
 
				+        print('match_head_value_dict', match_head_value_dict)
			
 
				+
			
 
				+    add_row_dict = {}
			
 
				+    delete_head_index_list = []
			
 
				+    delete_value_index_list = []
			
 
				+    for row_index, value_index_list in match_head_value_dict.items():
			
 
				+        head_row = b_table[row_index]
			
 
				+        delete_head_index_list.append(row_index)
			
 
				+        left_value_text = ""
			
 
				+        right_value_text = ""
			
 
				+        for value_index in value_index_list:
			
 
				+            value_row = b_table[value_index]
			
 
				+            delete_value_index_list.append(value_index)
			
 
				+            for col in value_row[:2]:
			
 
				+                left_value_text += col
			
 
				+            for col in value_row[2:]:
			
 
				+                right_value_text += col
			
 
				+        head_row[1] = left_value_text
			
 
				+        head_row[3] = right_value_text
			
 
				+        add_row_dict[row_index] = head_row
			
 
				+
			
 
				+    # 删掉原来的，加上新的row
			
 
				+    temp_list = []
			
 
				+    for row_i, row in enumerate(b_table):
			
 
				+        if row_i in delete_head_index_list:
			
 
				+            temp_list.append(add_row_dict.get(row_i))
			
 
				+            continue
			
 
				+        if row_i in delete_value_index_list:
			
 
				+            continue
			
 
				+        temp_list.append(row)
			
 
				+    b_table = temp_list
			
 
				+    return b_table
			
 
				+
			
 
				+
			
 
				+def add_last_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
			
 
				+                  table_lt_text_row_list, show=0):
			
 
				+    if not b_table:
			
 
				+        return b_table
			
 
				+    if len(b_table[0]) not in [4]:
			
 
				+        return b_table
			
 
				+
			
 
				+    blank_h_list = []
			
 
				+    max_h_list = []
			
 
				+    for lt_text_row in table_lt_text_row_list:
			
 
				+        if not lt_text_row:
			
 
				+            continue
			
 
				+        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
			
 
				+        max_h_list.append(max_h)
			
 
				+    max_h_list.sort(key=lambda x: x)
			
 
				+    for i in range(1, len(max_h_list)):
			
 
				+        blank_h_list.append(max_h_list[i] - max_h_list[i - 1])
			
 
				+    mean_blank_h = np.mean(blank_h_list)
			
 
				+    if show:
			
 
				+        print('add_last_rows blank_width_list', blank_h_list)
			
 
				+        print('add_last_rows mean_blank_h', mean_blank_h)
			
 
				+
			
 
				+    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
			
 
				+    match_row_list = []
			
 
				+    threshold = 5
			
 
				+    add_blank_h = mean_blank_h + threshold
			
 
				+    for li, lt_text_row in enumerate(lt_text_row_list):
			
 
				+        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
			
 
				+        if show:
			
 
				+            print('max_h > table_bbox[3]', lt_text_row, max_h, table_bbox[3])
			
 
				+        # 高度需要在表格y2和y2加上空白的距离间
			
 
				+        if table_bbox[3] < max_h < table_bbox[3] + add_blank_h:
			
 
				+            # lt_text x轴上穿过了中心bbox，则跳过
			
 
				+            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
			
 
				+                print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
			
 
				+                continue
			
 
				+
			
 
				+            # 左边需在表格x1和中心x1之间
			
 
				+            if table_bbox[0] - threshold <= min_w < center_blank_bbox[0]:
			
 
				+                match_row_list.append([lt_text_row, 0, max_h])
			
 
				+            # 右边需在表格x2和中心x2之间
			
 
				+            elif center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3:
			
 
				+                match_row_list.append([lt_text_row, 1, max_h])
			
 
				+            else:
			
 
				+                print('center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3')
			
 
				+                break
			
 
				+
			
 
				+            add_blank_h = add_blank_h + mean_blank_h + threshold
			
 
				+
			
 
				+    if show:
			
 
				+        print('add_last_rows match_row_list', match_row_list)
			
 
				+
			
 
				+    add_b_table = []
			
 
				+    real_max_h = None
			
 
				+    for mi, match_row in enumerate(match_row_list):
			
 
				+        lt_text_row, is_right, max_h = match_row
			
 
				+        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
			
 
				+        # 只有一列
			
 
				+        if len(lt_text_row) == 1:
			
 
				+            text = lt_text_row[0].get_text()
			
 
				+            match = re.search('[:：]+', text)
			
 
				+            real_max_h = max_h
			
 
				+            if not match:
			
 
				+                head = ""
			
 
				+                value = text
			
 
				+            else:
			
 
				+                head = text[:match.end()]
			
 
				+                value = text[match.end():]
			
 
				+        # 或 两列，其实是表头由于空白被隔开
			
 
				+        elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \
			
 
				+                and lt_text_row[1].get_text()[-1] in [':', "："]:
			
 
				+            text = lt_text_row[0].get_text() + lt_text_row[1].get_text()
			
 
				+            head = text
			
 
				+            value = ''
			
 
				+        # 两列
			
 
				+        elif len(lt_text_row) == 2:
			
 
				+            text1 = lt_text_row[0].get_text()
			
 
				+            match = re.search('[:：]+', text1)
			
 
				+            if not match:
			
 
				+                break
			
 
				+            real_max_h = max_h
			
 
				+            head = text1
			
 
				+            value = lt_text_row[1].get_text()
			
 
				+        else:
			
 
				+            if show:
			
 
				+                print('add_last_rows len(lt_text_row) break', len(lt_text_row))
			
 
				+            break
			
 
				+
			
 
				+        # 获取上一行，可能需要将值补到上一行
			
 
				+        if mi == 0 or len(add_b_table) == 0:
			
 
				+            last_row = b_table[-1]
			
 
				+            last_flag = 0
			
 
				+        else:
			
 
				+            last_row = add_b_table[-1]
			
 
				+            last_flag = 1
			
 
				+
			
 
				+        if is_right:
			
 
				+            if last_row[2] and not last_row[3] and not head and value:
			
 
				+                b_table[-1][3] = value
			
 
				+                current_row = ["", "", last_row[2], value]
			
 
				+            else:
			
 
				+                current_row = ["", "", head, value]
			
 
				+        else:
			
 
				+            if last_row[0] and not last_row[1] and not head and value:
			
 
				+                current_row = [last_row[0], value, "", ""]
			
 
				+            else:
			
 
				+                current_row = [head, value, "", ""]
			
 
				+
			
 
				+        # if last_flag == 0:
			
 
				+        #     b_table = b_table[:-1]
			
 
				+        add_b_table.append(current_row)
			
 
				+
			
 
				+        if show:
			
 
				+            print('current_row', current_row)
			
 
				+
			
 
				+    if show:
			
 
				+        print('add_b_table', add_b_table)
			
 
				+
			
 
				+    b_table += add_b_table
			
 
				+    if real_max_h is not None:
			
 
				+        table_bbox[3] = real_max_h
			
 
				+    return b_table
			
 
				+
			
 
				+
			
 
				+def add_first_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
			
 
				+                   table_lt_text_row_list, show=0):
			
 
				+    if not b_table:
			
 
				+        return b_table
			
 
				+    if len(b_table[0]) not in [4]:
			
 
				+        return b_table
			
 
				+
			
 
				+    blank_h_list = []
			
 
				+    max_h_list = []
			
 
				+    for lt_text_row in table_lt_text_row_list:
			
 
				+        if not lt_text_row:
			
 
				+            continue
			
 
				+        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
			
 
				+        max_h_list.append(max_h)
			
 
				+    max_h_list.sort(key=lambda x: x)
			
 
				+    for i in range(1, len(max_h_list)):
			
 
				+        blank_h_list.append(max_h_list[i] - max_h_list[i - 1])
			
 
				+    mean_blank_h = np.mean(blank_h_list)
			
 
				+    if show:
			
 
				+        print('add_first_rows blank_width_list', blank_h_list)
			
 
				+        print('add_first_rows mean_blank_h', mean_blank_h)
			
 
				+
			
 
				+    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
			
 
				+    match_row_list = []
			
 
				+    threshold = 5
			
 
				+    add_blank_h = mean_blank_h + threshold
			
 
				+    for li, lt_text_row in enumerate(lt_text_row_list):
			
 
				+        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
			
 
				+        if show:
			
 
				+            print('min_h < table_bbox[3]', lt_text_row, min_h, table_bbox[3])
			
 
				+        # 高度需要有一部分在在表格中
			
 
				+        if min_h <= table_bbox[1] < max_h:
			
 
				+            # lt_text x轴上穿过了中心bbox，则跳过
			
 
				+            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
			
 
				+                print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
			
 
				+                continue
			
 
				+            # match_row_list.append([lt_text_row, 1, min_h])
			
 
				+
			
 
				+            # 中心x1左边
			
 
				+            if min_w < center_blank_bbox[0]:
			
 
				+                match_row_list.append([lt_text_row, 0, min_h])
			
 
				+            # 中心x2右边
			
 
				+            elif center_blank_bbox[2] < max_w:
			
 
				+                match_row_list.append([lt_text_row, 1, min_h])
			
 
				+            else:
			
 
				+                break
			
 
				+
			
 
				+    if show:
			
 
				+        print('add_first_rows match_row_list', match_row_list)
			
 
				+
			
 
				+    real_min_h = None
			
 
				+    for mi, match_row in enumerate(match_row_list):
			
 
				+        lt_text_row, is_right, min_h = match_row
			
 
				+        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
			
 
				+        # 只有一列
			
 
				+        if len(lt_text_row) == 1:
			
 
				+            text = lt_text_row[0].get_text()
			
 
				+            match = re.search('[:：]+', text)
			
 
				+            real_min_h = min_h
			
 
				+            if not match:
			
 
				+                head = ""
			
 
				+                value = text
			
 
				+            else:
			
 
				+                head = text[:match.end()]
			
 
				+                value = text[match.end():]
			
 
				+        # # 或 两列，其实是表头由于空白被隔开
			
 
				+        # elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \
			
 
				+        #         and lt_text_row[1].get_text()[-1] in [':', "："]:
			
 
				+        #     text = lt_text_row[0].get_text() + lt_text_row[1].get_text()
			
 
				+        #     head = text
			
 
				+        #     value = ''
			
 
				+        # # 两列
			
 
				+        # elif len(lt_text_row) == 2:
			
 
				+        #     text1 = lt_text_row[0].get_text()
			
 
				+        #     match = re.search('[:：]+', text1)
			
 
				+        #     if not match:
			
 
				+        #         break
			
 
				+        #     real_max_h = max_h
			
 
				+        #     head = text1
			
 
				+        #     value = lt_text_row[1].get_text()
			
 
				+        else:
			
 
				+            if show:
			
 
				+                print('add_first_rows len(lt_text_row) break', len(lt_text_row))
			
 
				+            break
			
 
				+
			
 
				+        # 获取表格第一行，可能需要将值补进去
			
 
				+        if not head and value:
			
 
				+            if is_right:
			
 
				+                b_table[0][3] = value + b_table[0][3]
			
 
				+            else:
			
 
				+                b_table[0][1] = value + b_table[0][1]
			
 
				+
			
 
				+    if real_min_h is not None:
			
 
				+        table_bbox[1] = real_min_h
			
 
				+    return b_table
			
 
				+
			
 
				+
			
 
				+def get_row_bbox(row, mode='list'):
			
 
				+    # 提取所有x1, y1, x2, y2的值
			
 
				+
			
 
				+    if mode == 'list':
			
 
				+        x1_values = [x[0] for x in row]
			
 
				+        y1_values = [x[1] for x in row]
			
 
				+        x2_values = [x[2] for x in row]
			
 
				+        y2_values = [x[3] for x in row]
			
 
				+    elif mode == '.bbox':
			
 
				+        x1_values = [x.bbox[0] for x in row]
			
 
				+        y1_values = [x.bbox[1] for x in row]
			
 
				+        x2_values = [x.bbox[2] for x in row]
			
 
				+        y2_values = [x.bbox[3] for x in row]
			
 
				+
			
 
				+    min_x = min(x1_values)
			
 
				+    max_x = max(x2_values)
			
 
				+    min_y = min(y1_values)
			
 
				+    max_y = max(y2_values)
			
 
				+    return min_x, min_y, max_x, max_y
			
 
				+
			
 
				+
			
 
				+def shrink_bbox(img, bbox_list):
			
 
				+    def return_not_most_color_index(image_np, match_color):
			
 
				+        # 计算每个像素与背景色的欧几里得距离的平方
			
 
				+        diff = np.sum(np.sqrt((image_np.astype(np.int32) - match_color.astype(np.int32)) ** 2), axis=2)
			
 
				+        threshold = 100  # 假设阈值为 10000，可以调整
			
 
				+        diff_mask = diff > threshold
			
 
				+        # 获取与背景色相差较大的像素的索引
			
 
				+        diff_index = np.where(diff_mask)
			
 
				+        # print('diff_index.size', diff_index[0].size)
			
 
				+        return diff_index
			
 
				+
			
 
				+    def return_not_most_color_index_fast(image_np, match_color):
			
 
				+        # 将图像和匹配颜色转换为整数类型
			
 
				+        # image_int = image_np.astype(np.int32)
			
 
				+        # match_color_int = match_color.astype(np.int32)
			
 
				+
			
 
				+        # 计算每个像素与背景色的欧几里得距离的平方
			
 
				+        diff = np.sum((image_np - match_color) ** 2, axis=2)
			
 
				+        threshold = 20 # 假设阈值为 10000，可以调整
			
 
				+        threshold = threshold ** 2
			
 
				+        diff_mask = diff > threshold
			
 
				+        # 获取与背景色相差较大的像素的索引
			
 
				+        diff_index = np.where(diff_mask)
			
 
				+        # print('diff_index.size', diff_index[0].size)
			
 
				+        return diff_index
			
 
				+
			
 
				+
			
 
				+    # def count_colors_with_histogram(img):
			
 
				+    #     time00 = time.time()
			
 
				+    #
			
 
				+    #     # 计算每个颜色通道的直方图
			
 
				+    #     hist_b = cv2.calcHist([img], [0], None, [256], [0, 256])
			
 
				+    #     hist_g = cv2.calcHist([img], [1], None, [256], [0, 256])
			
 
				+    #     hist_r = cv2.calcHist([img], [2], None, [256], [0, 256])
			
 
				+    #
			
 
				+    #     # 将直方图合并成一个数组
			
 
				+    #     hist = np.concatenate((hist_b.flatten(), hist_g.flatten(), hist_r.flatten()))
			
 
				+    #
			
 
				+    #     # 获取非零值的索引及其数量
			
 
				+    #     non_zero_indices = np.nonzero(hist)[0]
			
 
				+    #     counts = hist[non_zero_indices]
			
 
				+    #
			
 
				+    #     # 将索引转换为颜色值
			
 
				+    #     colors = np.unravel_index(non_zero_indices, (256, 256, 256))
			
 
				+    #     colors = np.transpose(colors)
			
 
				+    #
			
 
				+    #     log("count_colors_with_histogram Time taken: " + str(time.time() - time00))
			
 
				+    #     return colors, counts
			
 
				+    #
			
 
				+    #
			
 
				+    # def count_colors_with_kmeans(img):
			
 
				+    #     time00 = time.time()
			
 
				+    #     img_color = img.reshape(-1, 3)
			
 
				+    #
			
 
				+    #     # 使用 KMeans 聚类，将颜色聚类为 16 种
			
 
				+    #     kmeans = KMeans(n_clusters=4, random_state=0, n_init=2, max_iter=10)
			
 
				+    #     kmeans.fit(img_color)
			
 
				+    #
			
 
				+    #     # 获取聚类后的标签和中心
			
 
				+    #     labels = kmeans.labels_
			
 
				+    #     centers = kmeans.cluster_centers_
			
 
				+    #
			
 
				+    #     # 统计每个聚类中心的数量
			
 
				+    #     unique_labels, counts = np.unique(labels, return_counts=True)
			
 
				+    #
			
 
				+    #     print("Time taken: ", time.time() - time00)
			
 
				+    #     return centers[unique_labels], counts
			
 
				+    #
			
 
				+    # def count_colors_with_bincount(img):
			
 
				+    #     time00 = time.time()
			
 
				+    #     img_color = img.reshape(-1, 3)
			
 
				+    #
			
 
				+    #     # 将颜色编码为一个整数
			
 
				+    #     colors_encoded = img_color[:, 0] * 256 * 256 + img_color[:, 1] * 256 + img_color[:, 2]
			
 
				+    #
			
 
				+    #     # 使用 bincount 计算每个颜色的数量
			
 
				+    #     counts = np.bincount(colors_encoded)
			
 
				+    #
			
 
				+    #     # 获取非零值的索引及其数量
			
 
				+    #     non_zero_indices = np.nonzero(counts)[0]
			
 
				+    #
			
 
				+    #     # 解码颜色值
			
 
				+    #     colors_decoded = []
			
 
				+    #     for index in non_zero_indices:
			
 
				+    #         r = (index // (256 * 256)) % 256
			
 
				+    #         g = (index // 256) % 256
			
 
				+    #         b = index % 256
			
 
				+    #         colors_decoded.append([r, g, b])
			
 
				+    #
			
 
				+    #     colors_decoded = np.array(colors_decoded)
			
 
				+    #     counts_non_zero = counts[non_zero_indices]
			
 
				+    #
			
 
				+    #     print("Time taken: ", time.time() - time00)
			
 
				+    #     return colors_decoded, counts_non_zero
			
 
				+
			
 
				+    # 统计每种颜色的出现次数
			
 
				+    # time00 = time.time()
			
 
				+
			
 
				+    # 对图像进行降采样
			
 
				+
			
 
				+    time0 = time.time()
			
 
				+    down_sample_factor = 8
			
 
				+    down_sampled_img = img[::down_sample_factor, ::down_sample_factor, :]
			
 
				+    down_sampled_img_color = down_sampled_img.reshape(-1, 3)
			
 
				+    colors, counts = np.unique(down_sampled_img_color, return_counts=True, axis=0)
			
 
				+    log('shrink_bbox 0 ' + str(time.time()-time0))
			
 
				+
			
 
				+    # 找到出现次数最多的颜色
			
 
				+    time0 = time.time()
			
 
				+    max_count_index = np.argmax(counts)
			
 
				+    most_frequent_color = colors[max_count_index]
			
 
				+    most_frequent_color = most_frequent_color.astype(np.int32)
			
 
				+    log('shrink_bbox 1 ' + str(time.time()-time0))
			
 
				+
			
 
				+    new_bbox_list = []
			
 
				+    img_int = img.astype(np.int32)
			
 
				+    time0 = time.time()
			
 
				+    for bbox in bbox_list:
			
 
				+        # img_bbox = img[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
			
 
				+        # img_bbox = img[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]
			
 
				+        img_bbox_int = img_int[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]
			
 
				+
			
 
				+        if 0 in img_bbox_int.shape:
			
 
				+            new_bbox_list.append(bbox)
			
 
				+            continue
			
 
				+
			
 
				+        # 左右上下开始扫描，碰到黑像素即停
			
 
				+        # index_list = return_first_black_index(img_bbox[:, :, :])
			
 
				+        index_list = return_not_most_color_index_fast(img_bbox_int, most_frequent_color)
			
 
				+
			
 
				+        if index_list[0].size == 0 or index_list[1].size == 0:
			
 
				+            new_bbox_list.append(bbox)
			
 
				+            continue
			
 
				+        min_h = index_list[0][0]
			
 
				+        max_h = index_list[0][-1]
			
 
				+
			
 
				+        img_bbox1 = np.swapaxes(img_bbox_int, 0, 1)
			
 
				+        # index_list = return_first_black_index(img_bbox1[:, :, :])
			
 
				+        index_list = return_not_most_color_index_fast(img_bbox1, most_frequent_color)
			
 
				+
			
 
				+        if index_list[0].size == 0 or index_list[1].size == 0:
			
 
				+            new_bbox_list.append(bbox)
			
 
				+            continue
			
 
				+        min_w = index_list[0][0]
			
 
				+        max_w = index_list[0][-1]
			
 
				+
			
 
				+        real_min_w = bbox[0] + min_w
			
 
				+        real_max_w = bbox[0] + max_w
			
 
				+        real_min_h = bbox[1] + min_h
			
 
				+        real_max_h = bbox[1] + max_h
			
 
				+        new_bbox = [real_min_w, real_min_h, real_max_w, real_max_h]
			
 
				+        new_bbox_list.append(new_bbox)
			
 
				+
			
 
				+        # cv2.imshow('img', img_bbox)
			
 
				+        # cv2.imshow('shrink', img[int(new_bbox[0][1]):int(new_bbox[2][1]), int(new_bbox[0][0]):int(new_bbox[2][0]), :])
			
 
				+        # cv2.waitKey(0)
			
 
				+    log('shrink_bbox 2 ' + str(time.time() - time0))
			
 
				+    return new_bbox_list
			
 
				+
			
 
				+
			
 
				+def shrink_bbox_by_pixel(lt_text_list):
			
 
				+    for lt_text in lt_text_list:
			
 
				+        bbox = lt_text.bbox
			
 
				+        bbox_h = abs(bbox[3] - bbox[1])
			
 
				+        shrink_h = bbox_h / 2
			
 
				+        new_bbox = [bbox[0], int(bbox[1] + shrink_h / 2),
			
 
				+                    bbox[2], int(bbox[3] - shrink_h / 2)
			
 
				+                    ]
			
 
				+        lt_text.bbox = new_bbox
			
 
				+    return lt_text_list
			
 
				+
			
 
				+
			
 
				+def get_inter_part(bbox_list, show=0):
			
 
				+    if not bbox_list:
			
 
				+        return None
			
 
				+
			
 
				+    # xs = [[x[0], x[2]] for x in bbox_list]
			
 
				+    # xs = [y for x in xs for y in x]
			
 
				+    #
			
 
				+    # ys = [[x[1], x[3]] for x in bbox_list]
			
 
				+    # ys = [y for x in ys for y in x]
			
 
				+    #
			
 
				+    # xs.sort(key=lambda x: x)
			
 
				+    # ys.sort(key=lambda x: x)
			
 
				+    #
			
 
				+    # max_index = len(bbox_list)
			
 
				+    # min_index = max_index - 1
			
 
				+    #
			
 
				+    # min_x, max_x = xs[min_index], xs[max_index]
			
 
				+    # min_y, max_y = ys[min_index], ys[max_index]
			
 
				+
			
 
				+    # min_x, min_y, max_x, max_y = bbox_list[0]
			
 
				+    # for bbox in bbox_list:
			
 
				+    #     # if min_x < bbox[0]:
			
 
				+    #     #     min_x = bbox[0]
			
 
				+    #     # if min_y < bbox[1]:
			
 
				+    #     #     min_y = bbox[1]
			
 
				+    #     # if max_x > bbox[2]:
			
 
				+    #     #     max_x = bbox[2]
			
 
				+    #     # if max_y > bbox[3]:
			
 
				+    #     #     max_y = bbox[3]
			
 
				+    #     if min_x < min(bbox[0], bbox[2]):
			
 
				+    #         min_x = min(bbox[0], bbox[2])
			
 
				+    #     if min_y < min(bbox[1], bbox[3]):
			
 
				+    #         min_y = min(bbox[1], bbox[3])
			
 
				+    #     if max_x > max(bbox[0], bbox[2]):
			
 
				+    #         max_x = max(bbox[0], bbox[2])
			
 
				+    #     if max_y > max(bbox[1], bbox[3]):
			
 
				+    #         max_y = max(bbox[1], bbox[3])
			
 
				+    #     # print('min_x, min_y, max_x, max_y', min_x, min_y, max_x, max_y)
			
 
				+    # _min_x = min(min_x, max_x)
			
 
				+    # _max_x = max(min_x, max_x)
			
 
				+    # _min_y = min(min_y, max_y)
			
 
				+    # _max_y = max(min_y, max_y)
			
 
				+
			
 
				+    # # 同一行的bbox去重，取最大的
			
 
				+    # # used_bbox_list = []
			
 
				+    # current_bbox = bbox_list[0]
			
 
				+    # delete_bbox_list = []
			
 
				+    # bbox_list.sort(key=lambda x: (x[1], x[3]))
			
 
				+    # threshold = 5
			
 
				+    # for bbox in bbox_list:
			
 
				+    #     if bbox == current_bbox:
			
 
				+    #         continue
			
 
				+    #     if current_bbox in delete_bbox_list:
			
 
				+    #         current_bbox = bbox
			
 
				+    #         continue
			
 
				+    #     if current_bbox[1] - threshold <= bbox[1] <= bbox[3] <= current_bbox[3] + threshold:
			
 
				+    #         if abs(current_bbox[0] - current_bbox[2]) > abs(bbox[0] - bbox[2]):
			
 
				+    #             delete_bbox_list.append(bbox)
			
 
				+    #         else:
			
 
				+    #             delete_bbox_list.append(current_bbox)
			
 
				+    #     else:
			
 
				+    #         current_bbox = bbox
			
 
				+    #
			
 
				+    # for bbox in delete_bbox_list:
			
 
				+    #     if bbox in bbox_list:
			
 
				+    #         bbox_list.remove(bbox)
			
 
				+
			
 
				+    bbox_list.sort(key=lambda x: (x[0], x[2]))
			
 
				+    min_x, min_y, max_x, max_y = bbox_list[0]
			
 
				+    for bbox in bbox_list:
			
 
				+        if min_x < bbox[0]:
			
 
				+            min_x = bbox[0]
			
 
				+        if min_y < bbox[1]:
			
 
				+            min_y = bbox[1]
			
 
				+        if max_x > bbox[2]:
			
 
				+            max_x = bbox[2]
			
 
				+        if max_y > bbox[3]:
			
 
				+            max_y = bbox[3]
			
 
				+    _min_x = min(min_x, max_x)
			
 
				+    _max_x = max(min_x, max_x)
			
 
				+    _min_y = min(min_y, max_y)
			
 
				+    _max_y = max(min_y, max_y)
			
 
				+    if show:
			
 
				+        print('get_inter_part', [_min_x, _min_y, _max_x, _max_y])
			
 
				+    return [_min_x, _min_y, _max_x, _max_y]
			
 
				+
			
 
				+
			
 
def get_inter_part_250530(bbox_list, show=0):
    """Collect and sort the x coordinates of a list of ``[x1, y1, x2, y2]`` boxes.

    Returns ``None`` for an empty ``bbox_list``. For a non-empty list the
    function builds four coordinate lists, sorts the left edges in descending
    order and the right edges in ascending order, and then falls off the end,
    so the result is also ``None``.

    NOTE(review): the body looks unfinished -- ``y1_list``, ``y2_list`` and
    ``show`` are never used and nothing is returned. Confirm the intended
    result before calling this.
    """
    if not bbox_list:
        return None

    # Split every box into its four coordinates.
    x1_list = [x[0] for x in bbox_list]
    x2_list = [x[2] for x in bbox_list]
    y1_list = [x[1] for x in bbox_list]
    y2_list = [x[3] for x in bbox_list]

    # Largest left edge first, smallest right edge first.
    x1_list.sort(key=lambda x: x, reverse=True)
    x2_list.sort(key=lambda x: x)
			
 
				+
			
 
				+
			
 
				+def get_straight_lines_from_image(image_np, threshold=50):
			
 
				+    # 读取图像
			
 
				+    if image_np is None:
			
 
				+        print("无法读取图像")
			
 
				+        return False
			
 
				+
			
 
				+    # 转换为灰度图像
			
 
				+    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
			
 
				+
			
 
				+    # 使用Canny算子进行边缘检测
			
 
				+    edges = cv2.Canny(gray, 20, 150)
			
 
				+
			
 
				+    cv2.imshow('edges', edges)
			
 
				+
			
 
				+    # 使用霍夫直线变换检测直线
			
 
				+    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold,
			
 
				+                            minLineLength=50, maxLineGap=2)
			
 
				+
			
 
				+    for line in lines:
			
 
				+        line = line[0]
			
 
				+        print('line', line)
			
 
				+        cv2.line(image_np, line[:2], line[2:], (0, 0, 255))
			
 
				+
			
 
				+    cv2.imshow('img', image_np)
			
 
				+    cv2.waitKey(0)
			
 
				+
			
 
				+    print('lines', lines)
			
 
				+
			
 
				+
			
 
				+def get_table_bbox(table):
			
 
				+    x1 = min([y.bbox[0] for x in table for y in x])
			
 
				+    y1 = min([y.bbox[1] for x in table for y in x])
			
 
				+    x2 = max([y.bbox[2] for x in table for y in x])
			
 
				+    y2 = max([y.bbox[3] for x in table for y in x])
			
 
				+    return [x1, y1, x2, y2]
			
 
				+
			
 
				+
			
 
				+@memory_decorator
			
 
				+def merge_intersecting_lists(lists):
			
 
				+    merged_lists = []
			
 
				+    for current_list in lists:
			
 
				+        # 当前列表转换为集合，方便后续操作
			
 
				+        current_set = set(current_list)
			
 
				+        merged = False
			
 
				+        # 遍历已合并的列表，检查是否有交集
			
 
				+        for i in range(len(merged_lists)):
			
 
				+            merged_set = set(merged_lists[i])
			
 
				+            # 如果存在交集
			
 
				+            if current_set & merged_set:
			
 
				+                # 合并两个列表，并去重
			
 
				+                merged_lists[i] = list(merged_set.union(current_set))
			
 
				+                merged = True
			
 
				+                break
			
 
				+        # 如果没有与任何已合并列表交集，则添加为新的合并列表
			
 
				+        if not merged:
			
 
				+            merged_lists.append(current_list.copy())
			
 
				+    return merged_lists
			
 
				+
			
 
				+
			
 
def merge_same_bbox(lt_text_list, avg_char_width, show=0):
    """Join a very short text box with the colon-bearing box to its right.

    For every pair ``(i, j)`` with ``j > i`` the two boxes are merged into a
    new ``TextBox`` when all of the following hold:

    * box ``j`` starts strictly to the right of box ``i``'s right edge and
      the two do not overlap on the x axis (``line_iou`` is 0);
    * their y ranges overlap with ``line_iou`` above 0.9;
    * the horizontal gap is smaller than ``avg_char_width * 5``;
    * box ``j``'s text contains a colon (``:`` or ``：``), box ``i``'s text
      does not, and box ``i``'s text is at most 2 characters long.

    Both list slots are pointed at the merged box; duplicates are removed
    afterwards through ``set`` and the result is sorted by ``(x1, y1)``.

    :param lt_text_list: list of text boxes with ``bbox`` and ``get_text()``;
                         slots of this list are overwritten in place
    :param avg_char_width: average character width, used to bound the gap
    :param show: truthy to print every merged box
    :return: a new, de-duplicated and sorted list
    """
    from format_convert.convert_tree import TextBox
    for i in range(len(lt_text_list)):
        # Captured once per outer iteration: even after slot i is replaced by
        # a merged box below, the remaining comparisons still use this box.
        lt_text1 = lt_text_list[i]
        line1_x = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
        line1_y = ((lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0))

        for j in range(i+1, len(lt_text_list)):
            lt_text2 = lt_text_list[j]
            # if lt_text1 == lt_text2:
            #     continue
            # Box 2 has to begin to the right of box 1.
            if lt_text1.bbox[2] >= lt_text2.bbox[0]:
                continue

            # The boxes must not overlap on the x axis.
            line2_x = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
            if line_iou(line1_x, line2_x) > 0:
                continue

            # The y-axis IoU must exceed the threshold.
            line2_y = ((lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0))
            if line_iou(line1_y, line2_y) > 0.9 \
                    and abs(lt_text1.bbox[2] - lt_text2.bbox[0]) < avg_char_width * 5 \
                    and re.search('[:：]', lt_text2.get_text()) \
                    and not re.search('[:：]', lt_text1.get_text()) \
                    and len(lt_text1.get_text()) <= 2:
                # Merged box: spans from box 1's left edge to box 2's right
                # edge and covers the full vertical extent of both.
                new_lt_text = TextBox(text=lt_text1.get_text() + lt_text2.get_text(),
                                      bbox=[lt_text1.bbox[0], min(lt_text1.bbox[1], lt_text2.bbox[1]),
                                            lt_text2.bbox[2], max(lt_text1.bbox[3], lt_text2.bbox[3])
                                            ])
                lt_text_list[i] = new_lt_text
                lt_text_list[j] = new_lt_text
                if show:
                    print('new_lt_text', new_lt_text)

    # The merged box now sits in two slots; set() collapses the duplicates
    # (this relies on the box objects being hashable).
    lt_text_list = list(set(lt_text_list))
    lt_text_list.sort(key=lambda x: (x.bbox[0], x.bbox[1]))

    return lt_text_list
			
 
				+
			
 
				+
			
 
				+def sort_by_read_order(lt_text_list, threshold=10):
			
 
				+    if not lt_text_list:
			
 
				+        return lt_text_list
			
 
				+
			
 
				+    # 按 y1 升序排序
			
 
				+    lt_text_list.sort(key=lambda x: x.bbox[1])
			
 
				+
			
 
				+    # 初始化变量
			
 
				+    sorted_lt_text_list = []
			
 
				+    current_row = [lt_text_list[0]]
			
 
				+
			
 
				+    for i in range(1, len(lt_text_list)):
			
 
				+        # 如果当前边界框的 y1 与前一个边界框的 y1 差距小于阈值，认为是同一行
			
 
				+        if abs(lt_text_list[i].bbox[1] - lt_text_list[i - 1].bbox[1]) < threshold:
			
 
				+            current_row.append(lt_text_list[i])
			
 
				+        else:
			
 
				+            # 对当前行按 x1 排序并添加到结果中
			
 
				+            current_row.sort(key=lambda x: x.bbox[0])
			
 
				+            sorted_lt_text_list += current_row
			
 
				+            current_row = [lt_text_list[i]]
			
 
				+
			
 
				+    # 添加最后一行
			
 
				+    current_row.sort(key=lambda x: x.bbox[0])
			
 
				+    sorted_lt_text_list += current_row
			
 
				+    return sorted_lt_text_list
			
 
				+
			
 
				+
			
 
				+def delete_empty_bbox(lt_text_list, show=0):
			
 
				+    temp_list = []
			
 
				+    for lt_text in lt_text_list:
			
 
				+        if lt_text.get_text() in [':', "：", ";", "；"] \
			
 
				+                or re.sub('\s', '', lt_text.get_text()) == "":
			
 
				+            continue
			
 
				+        temp_list.append(lt_text)
			
 
				+    lt_text_list = temp_list
			
 
				+    return lt_text_list
			
 
				+
			
 
				+
			
 
				+def standard_table(table, show=0):
			
 
				+    if not table:
			
 
				+        return table
			
 
				+
			
 
				+    # 去掉占位符
			
 
				+    for ri, row in enumerate(table):
			
 
				+        for ci, col in enumerate(row):
			
 
				+            if '@@:' in col.get('text'):
			
 
				+                col['text'] = re.sub('@@:', '', col.get('text'))
			
 
				+
			
 
				+    # 修复一些表头冒号ocr提取不到被作为值的问题
			
 
				+    for ri, row in enumerate(table):
			
 
				+        if row[0].get('text') == '' and row[1].get('text') != '' and row[2].get('text') != '' and row[3].get('text') == '':
			
 
				+            row[0]['text'] = row[1].get('text')
			
 
				+            row[1]['text'] = ''
			
 
				+            if show:
			
 
				+                print('standard_table, add colon head', table[ri])
			
 
				+
			
 
				+    # 修复表头值上下错位的情况
			
 
				+    # head          head
			
 
				+    #       value           value
			
 
				+    delete_row_index_list = []
			
 
				+    for ri, row in enumerate(table):
			
 
				+        if ri == 0:
			
 
				+            continue
			
 
				+        last_row = table[ri - 1]
			
 
				+        if last_row[0].get('text') != '' and last_row[1].get('text') == '' \
			
 
				+                and row[0].get('text') == '' and row[1].get('text') != '' \
			
 
				+                and last_row[2].get('text') != '' and last_row[3].get('text') == '' \
			
 
				+                and row[2].get('text') == '' and row[3].get('text') != '':
			
 
				+            # 补上表头
			
 
				+            row[0]['text'] = last_row[0].get('text')
			
 
				+            row[2]['text'] = last_row[2].get('text')
			
 
				+            delete_row_index_list.append(ri - 1)
			
 
				+            if show:
			
 
				+                print('standard_table, fix head value 1', table[ri])
			
 
				+
			
 
				+    temp_list = []
			
 
				+    for ri, row in enumerate(table):
			
 
				+        if ri in delete_row_index_list:
			
 
				+            continue
			
 
				+        temp_list.append(row)
			
 
				+    table = temp_list
			
 
				+
			
 
				+    # 修复值未被合进上一行的情况
			
 
				+    # head  value   head    value
			
 
				+    #       value           value
			
 
				+    delete_row_index_list = []
			
 
				+    for ri, row in enumerate(table):
			
 
				+        if ri == 0:
			
 
				+            continue
			
 
				+        last_row = table[ri - 1]
			
 
				+        if last_row[0].get('text') != '' and last_row[1].get('text') != '' \
			
 
				+                and row[0].get('text') == '' and row[1].get('text') != '' \
			
 
				+                and last_row[2].get('text') != '' and last_row[3].get('text') != '' \
			
 
				+                and row[2].get('text') == '' and row[3].get('text') != '':
			
 
				+            # 补上值
			
 
				+            last_row[1]['text'] += row[1]['text']
			
 
				+            last_row[3]['text'] += row[3]['text']
			
 
				+            delete_row_index_list.append(ri)
			
 
				+    temp_list = []
			
 
				+    for ri, row in enumerate(table):
			
 
				+        if ri in delete_row_index_list:
			
 
				+            continue
			
 
				+        temp_list.append(row)
			
 
				+    table = temp_list
			
 
				+    return table
			
 
				+
			
 
				+
			
 
				+@memory_decorator
			
 
				+def find_outline_lt_text(lt_text_list, show=0):
			
 
				+    lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
			
 
				+    used_lt_text_list = []
			
 
				+    row_list = []
			
 
				+    for lt_text1 in lt_text_list:
			
 
				+        if lt_text1 in used_lt_text_list:
			
 
				+            continue
			
 
				+        row = [lt_text1]
			
 
				+        used_lt_text_list.append(lt_text1)
			
 
				+        for lt_text2 in lt_text_list:
			
 
				+            if lt_text2 in used_lt_text_list:
			
 
				+                continue
			
 
				+            line1 = [(lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0)]
			
 
				+            line2 = [(lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0)]
			
 
				+            if line_iou(line1, line2) > 0:
			
 
				+                row.append(lt_text2)
			
 
				+                used_lt_text_list.append(lt_text2)
			
 
				+        row_list.append(row)
			
 
				+
			
 
				+    outline_lt_text_list = []
			
 
				+    for row in row_list:
			
 
				+        if len(row) >= 2:
			
 
				+            continue
			
 
				+        outline_lt_text_list += row
			
 
				+
			
 
				+    if show:
			
 
				+        print('outline_lt_text_list', outline_lt_text_list)
			
 
				+    return outline_lt_text_list
			
 
				+
			
 
				+
			
 
				+def get_iou(bbox1, bbox2):
			
 
				+    # 提取边界框的坐标
			
 
				+    x1_1, y1_1, x2_1, y2_1 = bbox1
			
 
				+    x1_2, y1_2, x2_2, y2_2 = bbox2
			
 
				+
			
 
				+    # 判断是否完全包含
			
 
				+    if (x1_1 <= x1_2 and y1_1 <= y1_2 and x2_1 >= x2_2 and y2_1 >= y2_2) or \
			
 
				+            (x1_2 <= x1_1 and y1_2 <= y1_1 and x2_2 >= x2_1 and y2_2 >= y2_1):
			
 
				+        return 1.0
			
 
				+
			
 
				+    # 计算交集区域的坐标
			
 
				+    inter_x1 = max(x1_1, x1_2)
			
 
				+    inter_y1 = max(y1_1, y1_2)
			
 
				+    inter_x2 = min(x2_1, x2_2)
			
 
				+    inter_y2 = min(y2_1, y2_2)
			
 
				+
			
 
				+    # 计算交集区域的面积
			
 
				+    inter_width = max(0, inter_x2 - inter_x1 + 1)
			
 
				+    inter_height = max(0, inter_y2 - inter_y1 + 1)
			
 
				+    inter_area = inter_width * inter_height
			
 
				+
			
 
				+    # 计算两个边界框的面积
			
 
				+    bbox1_area = (x2_1 - x1_1 + 1) * (y2_1 - y1_1 + 1)
			
 
				+    bbox2_area = (x2_2 - x1_2 + 1) * (y2_2 - y1_2 + 1)
			
 
				+
			
 
				+    # 计算并集区域的面积
			
 
				+    union_area = bbox1_area + bbox2_area - inter_area
			
 
				+
			
 
				+    # 计算 IoU
			
 
				+    iou = inter_area / union_area if union_area != 0 else 0
			
 
				+
			
 
				+    return iou
			
 
				+
			
 
				+
			
 
def fix_cross_bbox(lt_text_list, show=0):
    """Pull apart text boxes whose bounding boxes overlap slightly.

    Every ordered pair of different boxes with ``get_iou(...) > 0`` is
    examined. On each axis, when box 2's near edge lies strictly inside box 1
    and the overlap is less than half of the larger box extent on that axis,
    box 1's far edge is moved back to box 2's near edge and box 2's near edge
    is moved forward to box 1's former far edge. The ``bbox`` attributes are
    rewritten in place (as lists), also when no edge was moved.

    :param lt_text_list: list of objects with a writable ``bbox`` attribute
    :param show: truthy to print each pair before and after the adjustment
    :return: the same list object
    """
    for lt_text1 in lt_text_list:
        for lt_text2 in lt_text_list:
            if lt_text1 == lt_text2:
                continue
            if get_iou(lt_text1.bbox, lt_text2.bbox) > 0:
                if show:
                    print('fix_cross_bbox1', lt_text1, lt_text2)
                # x?0/x?1/x?2/x?3 are left/top/right/bottom of box 1 and box 2.
                x10, x11, x12, x13 = lt_text1.bbox
                x20, x21, x22, x23 = lt_text2.bbox

                # Overlap on the right side of box 1. The overlap must stay
                # small; a large one means the boxes do not really cross on
                # this axis.
                if x10 < x20 < x12 and x12 - x20 < max(abs(x12 - x10), abs(x20 - x22)) / 2:
                    x12 = min(lt_text1.bbox[2], lt_text2.bbox[0])
                    x20 = max(lt_text1.bbox[2], lt_text2.bbox[0])

                # Overlap on the bottom side of box 1, same size restriction.
                if x11 < x21 < x13 and x13 - x21 < max(abs(x13 - x11), abs(x21 - x23)) / 2:
                    x13 = min(lt_text1.bbox[3], lt_text2.bbox[1])
                    x21 = max(lt_text1.bbox[3], lt_text2.bbox[1])

                # Write back immediately so later pairs see the adjusted boxes.
                lt_text1.bbox = [x10, x11, x12, x13]
                lt_text2.bbox = [x20, x21, x22, x23]
                if show:
                    print('fix_cross_bbox2', lt_text1, lt_text2)
    return lt_text_list
			
 
				+
			
 
				+
			
 
def split_lt_text_by_many_space(lt_text_list, show=0):
    """Normalise spaces inside text boxes and split boxes on wide gaps.

    Three passes run one after another; each collects replacement
    ``TextBox`` objects, removes the originals from ``lt_text_list`` and
    appends the replacements (the list is modified in place):

    1. Leading/trailing ASCII or full-width spaces are stripped and the bbox
       is shrunk horizontally by the stripped share of the display width.
    2. For texts with at least two space runs, the text is cut at the longest
       run; if both sides look like ``X Y:`` (one char, spaces, one char plus
       colon) the inner spaces are removed and the long run is kept.
    3. A text with a run of four or more spaces that splits into exactly two
       parts, each with at least two letters/digits/CJK characters, becomes
       two boxes anchored at the left and right edge of the original bbox.

    :param lt_text_list: list of objects with ``bbox`` and ``get_text()``
    :param show: truthy to print debugging information
    :return: the same list object
    """
    from format_convert.convert_tree import TextBox

    # Pass 1: handle leading and trailing spaces.
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox

        if len(text) == 0:
            continue
        text_unicode_len = get_char_unicode_length(text)
        if text_unicode_len == 0:
            continue
        # Pixels per display-width unit for this box.
        # NOTE(review): only a width of exactly 0 is guarded here; confirm
        # get_char_unicode_length cannot return a negative value.
        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        space1 = re.findall('^[ 　]+', text)
        if space1:
            space1 = ''.join(space1)
            space1_unicode_len = get_char_unicode_length(space1)
            space1_pixel_len = space1_unicode_len * ratio
            text = re.sub('^[ 　]+', '', text)
            # Move the left edge right by the width of the stripped spaces.
            bbox = [bbox[0] + space1_pixel_len, bbox[1], bbox[2], bbox[3]]
            if len(text) == 0:
                continue
            text_unicode_len = get_char_unicode_length(text)
            if text_unicode_len == 0:
                continue
            ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        space2 = re.findall('[ 　]+$', text)
        if space2:
            space2 = ''.join(space2)
            space2_unicode_len = get_char_unicode_length(space2)
            space2_pixel_len = space2_unicode_len * ratio
            text = re.sub('[ 　]+$', '', text)
            # Move the right edge left by the width of the stripped spaces.
            bbox = [bbox[0], bbox[1], bbox[2] - space2_pixel_len, bbox[3]]
            if len(text) == 0:
                continue
            text_unicode_len = get_char_unicode_length(text)
            if text_unicode_len == 0:
                continue
            ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        if space1 or space2:
            new_lt_text = TextBox(text=text, bbox=bbox)
            add_lt_text_list.append(new_lt_text)
            delete_lt_text_list.append(lt_text)

    for lt_text in delete_lt_text_list:
        if lt_text in lt_text_list:
            lt_text_list.remove(lt_text)
    lt_text_list += add_lt_text_list

    # Pass 2: headers whose characters are spaced out, two per box, e.g.
    # "电  话：        电  话："
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox

        if len(text) == 0:
            continue

        space_list = re.findall('[ 　]+', text)
        if len(space_list) >= 2:
            # Longest run of spaces separates the two headers.
            space_list.sort(key=lambda x: len(x))
            max_space = space_list[-1]
            match = re.search(max_space, text)
            if show:
                print('max_space', max_space)
                print('space_list', space_list)
            if match:
                part1 = text[:match.start()]
                part2 = text[match.end():]
                ss1 = re.split('[ 　]+', part1)
                ss2 = re.split('[ 　]+', part2)

                # Both sides must be "one char, spaces, one char + colon".
                if len(ss1) == 2 and len(ss1[0]) == 1 and len(ss1[1]) == 2 and ss1[1][-1] in [':', '：'] \
                        and len(ss2) == 2 and len(ss2[0]) == 1 and len(ss2[1]) == 2 and ss2[1][-1] in [':', '：']:
                    new_text = ''.join(ss1) + max_space + ''.join(ss2)
                    new_lt_text = TextBox(text=new_text, bbox=bbox)
                    add_lt_text_list.append(new_lt_text)
                    delete_lt_text_list.append(lt_text)

    if show:
        print('split_lt_text_by_many_space add_lt_text_list222', add_lt_text_list)
        print('split_lt_text_by_many_space delete_lt_text_list222', delete_lt_text_list)

    for lt_text in delete_lt_text_list:
        if lt_text in lt_text_list:
            lt_text_list.remove(lt_text)
    lt_text_list += add_lt_text_list

    # Pass 3: a long run of spaces in the middle splits the box in two.
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox

        if len(text) == 0:
            continue

        text_unicode_len = get_char_unicode_length(text)
        if text_unicode_len == 0:
            continue
        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        # There is a run of 4+ spaces and the spaces cut the text into
        # exactly two parts.
        match = re.search('[ 　]{4,}', text)
        ss = re.split('[ 　]+', text)
        if match and len(ss) == 2:
            # if match:
            part1 = text[:match.start()]
            part2 = text[match.end():]

            l1 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part1)
            l2 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part2)
            # Both sides need enough real characters.
            if len(l1) >= 2 and len(l2) >= 2:
                part1_unicode_len = get_char_unicode_length(part1)
                part2_unicode_len = get_char_unicode_length(part2)

                part1_pixel_len = ratio * part1_unicode_len
                part2_pixel_len = ratio * part2_unicode_len

                # avg_char_w = abs(bbox[0] - bbox[2]) / len(text)
                # Left part hugs the left edge, right part hugs the right edge.
                bbox1 = [bbox[0], bbox[1], bbox[0] + part1_pixel_len, bbox[3]]
                bbox2 = [bbox[2] - part2_pixel_len, bbox[1], bbox[2], bbox[3]]
                # Add the parts as the project's own TextBox objects.
                new_lt_text1 = TextBox(text=part1, bbox=bbox1)
                new_lt_text2 = TextBox(text=part2, bbox=bbox2)
                add_lt_text_list += [new_lt_text1, new_lt_text2]
                delete_lt_text_list.append(lt_text)

    for lt_text in delete_lt_text_list:
        if lt_text in lt_text_list:
            lt_text_list.remove(lt_text)
    lt_text_list += add_lt_text_list

    if show:
        print('split_lt_text_by_many_space add_lt_text_list333', add_lt_text_list)
        print('split_lt_text_by_many_space delete_lt_text_list333', delete_lt_text_list)

    return lt_text_list
			
 
				+
			
 
				+
			
 
				+def get_char_unicode_length(text, show=0):
			
 
				+    # char_reg_len_dict = {
			
 
				+    #     '[ ]': 1,
			
 
				+    #     '[　]': 1.5,
			
 
				+    #     '[\u4e00-\u9fff]': 1.5,
			
 
				+    #     '[a-zA-Z0-9#@,^.+=\(\)<>\-@#$%&*\[\]\'":;?~!’‘“”{}/]': 1,
			
 
				+    #     '[：，。！￥……（）【】；？《》、]': 1.5
			
 
				+    # }
			
 
				+    #
			
 
				+    # text_real_len = 0
			
 
				+    # for reg, char_len in char_reg_len_dict.items():
			
 
				+    #     cs = re.findall(reg, text)
			
 
				+    #     text_real_len += len(cs) * char_len
			
 
				+    #
			
 
				+    # real_avg_char_len = abs(bbox[2] - bbox[0]) / text_real_len
			
 
				+    #
			
 
				+    # char_reg_real_len_dict = {}
			
 
				+    # for reg, char_len in char_reg_len_dict.items():
			
 
				+    #     char_reg_real_len_dict[reg] = real_avg_char_len * char_len
			
 
				+    #
			
 
				+    # return char_reg_real_len_dict
			
 
				+
			
 
				+    width = wcwidth.wcswidth(text)
			
 
				+    if show:
			
 
				+        print('text unicode_length', text, width)
			
 
				+    return width
			
 
				+
			
 
				+
			
 
				+def fix_final_row(table, show=0):
			
 
				+    # print('fix_final_row table', table)
			
 
				+    if len(table) < 2:
			
 
				+        return table
			
 
				+    last_row = table[-2]
			
 
				+    final_row = table[-1]
			
 
				+    print('final_row', final_row)
			
 
				+    print('last_row', last_row)
			
 
				+    delete_final_flag = 0
			
 
				+    if final_row[0] in ['', '@@:'] and final_row[1] in ['', '@@:'] \
			
 
				+            and final_row[2] in ['', '@@:'] and final_row[3] not in ['', '@@:']:
			
 
				+        table[-2][3] = final_row[3]
			
 
				+        delete_final_flag = 1
			
 
				+        if show:
			
 
				+            print('fix_final_row right', table[-2])
			
 
				+
			
 
				+    if final_row[0] in ['', '@@:'] and final_row[1] not in ['', '@@:'] \
			
 
				+            and final_row[2] in ['', '@@:'] and final_row[3] in ['', '@@:']:
			
 
				+        table[-2][1] = final_row[1]
			
 
				+        delete_final_flag = 1
			
 
				+        if show:
			
 
				+            print('fix_final_row left', table[-2])
			
 
				+
			
 
				+    if delete_final_flag:
			
 
				+        table = table[:-1]
			
 
				+
			
 
				+    return table
			
 
				+
			
 
				+
			
 
if __name__ == '__main__':
    # Manual debugging entry point. The batch experiment below is disabled;
    # only the single-image line-detection demo at the bottom runs.
    #
    # from format_convert.convert_pdf import PDFConvert
    # pdf_c = PDFConvert(None, None, None)
    # from format_convert.convert_image import ImageProcess
    # img_p = ImageProcess(None, None)
    #
    # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_not_detect\*')
    # image_np_list = [[x, cv2.imread(x)] for x in ps]
    # for p, image_np in image_np_list:
    #     # limit the overall resolution
    #     image_np = img_p.resize_process(image_np)
    #     # run text recognition
    #     text_list, box_list = img_p.ocr_process(image_np)
    #     # convert to lt_text_box objects
    #     _lt_text_list = text_bbox_to_lt(text_list, box_list)
    # first use the bboxes to pre-judge whether a borderless table may exist
    # _flag = judge_has_b_table_by_bbox(_lt_text_list, [], 0)
    # print('path', p, 'has b table', _flag)

    # Hard-coded local sample image (developer machine path).
    _pp = r'D:\Project\format_conversion_maxcompute\save_b_table\15-8292f767be81f404b813c119058a8a75.png'
    img111 = cv2.imread(_pp)
    img111 = pil_resize(img111, 1024, 768)
    # Opens OpenCV windows and blocks until a key is pressed.
    get_straight_lines_from_image(img111)
    pass
			
--- a/botr/utils.py
+++ b/botr/utils.py
@@ -38,6 +38,11 @@ def request_post(url, param, time_out=1000, use_zlib=False):
 
				 
			
 
				 
			
 
				 def line_iou(line1, line2, axis=0):
			
 
				+    if line1[0][axis] <= line2[0][axis] <= line2[1][axis] <= line1[1][axis]:
			
 
				+        return 1.
			
 
				+    if line2[0][axis] <= line1[0][axis] <= line1[1][axis] <= line2[1][axis]:
			
 
				+        return 1.
			
 
				+
			
 
				     inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
			
 
				     # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
			
 
				     union = min(abs(line1[0][axis]-line1[1][axis]), abs(line2[0][axis]-line2[1][axis]))
			
--- a/config/interface_new.yml
+++ b/config/interface_new.yml
@@ -58,7 +58,7 @@
 
				 
			
 
				     "tika": {
			
 
				       "port": [ 16020 ],
			
 
				-      "port_num": [ 2 ],
			
 
				+      "port_num": [ 1 ],
			
 
				       "gpu": [ -1 ]
			
 
				     }
			
 
				   },
			
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -1,4 +1,4 @@
 
				-#-*- coding: utf-8 -*-
			
 
				+# -*- coding: utf-8 -*-
			
 
				 import gc
			
 
				 import json
			
 
				 import sys
			
@@ -6,8 +6,20 @@ import os
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				 # 强制tf使用cpu
			
 
				 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
			
 
				+
			
 
				+# 动态添加 VERSION 属性到 Image 类
			
 
				+import PIL
			
 
				+from PIL import Image
			
 
				+Image.VERSION = PIL.__version__
			
 
				+
			
 
				 from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
			
 
				-    set_flask_global, get_md5_from_bytes, memory_decorator
			
 
				+    set_flask_global, get_md5_from_bytes, memory_decorator, register_all_fonts
			
 
				+
			
 
				+# 调用函数注册字体
			
 
				+# register_all_fonts("/usr/share/fonts/opentype/noto/")
			
 
				+# register_all_fonts("/usr/share/fonts/truetype/arphic")
			
 
				+# register_all_fonts("/usr/share/fonts/")
			
 
				+
			
 
				 from format_convert.convert_doc import doc2text, DocConvert
			
 
				 from format_convert.convert_docx import docx2text, DocxConvert
			
 
				 from format_convert.convert_image import picture2text, ImageConvert
			
@@ -18,6 +30,8 @@ from format_convert.convert_txt import txt2text, TxtConvert
 
				 from format_convert.convert_xls import xls2text, XlsConvert
			
 
				 from format_convert.convert_xlsx import xlsx2text, XlsxConvert
			
 
				 from format_convert.convert_zip import zip2text, ZipConvert
			
 
				+from format_convert.convert_wps import WpsConvert
			
 
				+from format_convert.convert_ofd import OfdConvert
			
 
				 from format_convert.convert_need_interface import from_atc_interface
			
 
				 
			
 
				 import hashlib
			
@@ -33,12 +47,28 @@ import logging
 
				 from bs4 import BeautifulSoup
			
 
				 from flask import Flask, request, g
			
 
				 import inspect
			
 
				+
			
 
				 logging.getLogger("pdfminer").setLevel(logging.WARNING)
			
 
				 from format_convert.table_correct import *
			
 
				 from format_convert.wrapt_timeout_decorator import *
			
 
				 from format_convert import _global
			
 
				 from config.max_compute_config import MAX_COMPUTE
			
 
				 
			
 
# File extensions listed as supported by the converter.
# NOTE(review): getText() in this change also dispatches on "ofd", which is
# not listed here -- confirm whether that omission is intentional.
support_file_types = [
    'txt',
    'pdf',
    'doc',
    'docx',
    'xls',
    'xlsx',
    'zip',
    'rar',
    'jpg',
    'png',
    'jpeg',
    'swf',
    'wps',
]
			
 
				 
			
 
				 if get_platform() == "Windows":
			
 
				     globals().update({"time_out": 1000})
			
@@ -64,6 +94,9 @@ def getText(_type, path_or_stream, _page_no=None, time_out=300):
 
				     except:
			
 
				         unique_type_dir = path_or_stream + "_" + _type + os.sep
			
 
				 
			
 
				+    if not os.path.exists(unique_type_dir):
			
 
				+        os.mkdir(unique_type_dir)
			
 
				+
			
 
				     if _type == "pdf":
			
 
				         if MAX_COMPUTE:
			
 
				             return PDFConvert(path_or_stream, unique_type_dir, _page_no).get_html()
			
@@ -102,11 +135,19 @@ def getText(_type, path_or_stream, _page_no=None, time_out=300):
 
				         if MAX_COMPUTE:
			
 
				             return TxtConvert(path_or_stream, unique_type_dir).get_html()
			
 
				         return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
			
 
				+    if _type == "wps":
			
 
				+        if MAX_COMPUTE:
			
 
				+            return WpsConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        return get_html_1(WpsConvert(path_or_stream, unique_type_dir))
			
 
				+    if _type == "ofd":
			
 
				+        if MAX_COMPUTE:
			
 
				+            return OfdConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        return get_html_1(OfdConvert(path_or_stream, unique_type_dir))
			
 
				     return [""]
			
 
				 
			
 
				 
			
 
				 def to_html(path, text):
			
 
				-    with open(path, 'w',encoding="utf8") as f:
			
 
				+    with open(path, 'w', encoding="utf8") as f:
			
 
				         f.write("<!DOCTYPE HTML>")
			
 
				         f.write('<head><meta charset="UTF-8"></head>')
			
 
				         f.write("<body>")
			
@@ -154,6 +195,11 @@ def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300, save_m
 
				     if get_platform() == "Windows":
			
 
				         _global._init()
			
 
				 
			
 
				+    if MAX_COMPUTE:
			
 
				+        _path = "/home/admin"
			
 
				+    else:
			
 
				+        _path = os.path.dirname(os.path.abspath(__file__))
			
 
				+
			
 
				     globals().update({"md5": _md5})
			
 
				     _global.update({"md5": _md5})
			
 
				     log("into unique_temp_file_process")
			
@@ -247,7 +293,7 @@ def cut_str(text_list, only_text_list, max_bytes_length=2000000):
 
				             return only_text_list
			
 
				 
			
 
				         # 截取字符
			
 
				-        all_text = all_text[:int(max_bytes_length/3)]
			
 
				+        all_text = all_text[:int(max_bytes_length / 3)]
			
 
				         return [all_text]
			
 
				     except Exception as e:
			
 
				         log("cut_str " + str(e))
			
@@ -336,7 +382,7 @@ def convert_maxcompute(data, ocr_model, otr_model):
 
				             print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
			
 
				         else:
			
 
				             print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
			
 
				-                  "is_success": 1}, time.time() - start_time)
			
 
				+                                        "is_success": 1}, time.time() - start_time)
			
 
				         return {"result_html": text, "result_text": only_text, "is_success": 1}
			
 
				     except Exception as e:
			
 
				         print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
			
@@ -350,6 +396,20 @@ app = Flask(__name__)
 
				 
			
 
				 @app.route('/convert', methods=['POST'])
			
 
				 def _convert():
			
 
				+    try:
			
 
				+        data = request.form
			
 
				+    except Exception:
			
 
				+        log_convert_result("1" + "0" * 15, [-1], "", 0,
			
 
				+                           None, None, time.time())
			
 
				+        traceback.print_exc()
			
 
				+        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
			
 
				+                           "is_success": 0, "swf_images": str([]),
			
 
				+                           "classification": ""})
			
 
				+    result = convert(data)
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def _convert_old_250613():
			
 
				     """
			
 
				     接口返回值：
			
 
				     {[str], 1}: 处理成功
			
@@ -377,11 +437,11 @@ def _convert():
 
				     # snapshot = tracemalloc.take_snapshot()
			
 
				 
			
 
				     _global._init()
			
 
				-    _global.update({"md5": "1"+"0"*15})
			
 
				+    _global.update({"md5": "1" + "0" * 15})
			
 
				     set_flask_global()
			
 
				     # _global.update({"port": str(port)})
			
 
				 
			
 
				-    log("into convert")
			
 
				+    log("into _convert")
			
 
				     start_time = time.time()
			
 
				     _md5 = _global.get("md5")
			
 
				     _type = None
			
@@ -395,12 +455,12 @@ def _convert():
 
				         file_path = data.get("file_path")
			
 
				         if file_path is None:
			
 
				             stream = base64.b64decode(data.get("file"))
			
 
				-            log("get bytes from file " + str(time.time()-_time))
			
 
				+            log("get bytes from file " + str(time.time() - _time))
			
 
				         # 有路径则直接取路径打开文件
			
 
				         else:
			
 
				             with open(file_path, "rb") as f:
			
 
				                 stream = f.read()
			
 
				-            log("get bytes from file_path " + str(time.time()-_time))
			
 
				+            log("get bytes from file_path " + str(time.time() - _time))
			
 
				         _type = data.get("type")
			
 
				         _md5 = get_md5_from_bytes(stream)
			
 
				         _md5 = _md5[0]
			
@@ -427,7 +487,8 @@ def _convert():
 
				             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
			
 
				             # text, swf_images = origin_unique_temp_file_process(stream, _type)
			
 
				             try:
			
 
				-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
			
 
				+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
			
 
				+                                                            time_out=globals().get('time_out'), save_middle=save_middle)
			
 
				             except TimeoutError:
			
 
				                 log("convert time out! 300 sec")
			
 
				                 text = [-5]
			
@@ -435,7 +496,8 @@ def _convert():
 
				         else:
			
 
				             # Linux 通过装饰器设置整个转换超时时间
			
 
				             try:
			
 
				-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
			
 
				+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
			
 
				+                                                            time_out=globals().get('time_out'), save_middle=save_middle)
			
 
				             except TimeoutError:
			
 
				                 log("convert time out! 300 sec")
			
 
				                 text = [-5]
			
@@ -447,11 +509,12 @@ def _convert():
 
				                 is_success = 1
			
 
				             else:
			
 
				                 is_success = 0
			
 
				-            log("md5: " + str(_md5)
			
 
				-                         + " finished result: " + str(text)
			
 
				-                         + " is_success: " + str(is_success) + " "
			
 
				-                         + str(_type) + " "
			
 
				-                         + " " + str(time.time() - start_time))
			
 
				+            log("md5: " + str(_md5) + " "
			
 
				+                + "finished result: " + str(text) + " "
			
 
				+                + "is_success: " + str(is_success) + " "
			
 
				+                + str(_type) + " "
			
 
				+                + 'None '
			
 
				+                + str(round(time.time() - start_time, 2)))
			
 
				             return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
			
 
				                                "is_success": is_success, "swf_images": str(swf_images)})
			
 
				 
			
@@ -484,16 +547,17 @@ def _convert():
 
				         if only_text[0] == '' and len(only_text) <= 1:
			
 
				             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
			
 
				             log("md5: " + str(_md5) + " "
			
 
				-                + " finished result: ['', 0] is_success: 1 "
			
 
				+                + "finished result: ['', 0] is_success: 1 "
			
 
				                 + str(_type) + " "
			
 
				-                + str(time.time() - start_time))
			
 
				+                + 'None '
			
 
				+                + str(round(time.time() - start_time, 2)))
			
 
				         else:
			
 
				-            log("md5: " + str(_md5) +
			
 
				-                " finished result: " + str(only_text)[:20] + " "
			
 
				+            log("md5: " + str(_md5) + " "
			
 
				+                + "finished result: " + str(only_text)[:20] + " "
			
 
				                 + str(len(str(text))) + " is_success: 1 "
			
 
				                 + str(_type) + " "
			
 
				                 + str(classification) + " "
			
 
				-                + str(time.time() - start_time))
			
 
				+                + str(round(time.time() - start_time, 2)))
			
 
				 
			
 
				         # log("growth end" + str(objgraph.growth()))
			
 
				         # log("most_common_types end" + str(objgraph.most_common_types(20)))
			
@@ -502,15 +566,24 @@ def _convert():
 
				                            "classification": classification})
			
 
				 
			
 
				     except ConnectionError:
			
 
				-        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
			
 
				-            + str(time.time() - start_time))
			
 
				+        # log("convert post has no data!" + " failed result: [-2] is_success: 0 "
			
 
				+        #     + str(round(time.time() - start_time, 2)))
			
 
				+        log("md5: " + str(_md5) + " "
			
 
				+            + "failed result: [-2] is_success: 0 "
			
 
				+            + str(_type) + " "
			
 
				+            + "None "
			
 
				+            + str(round(time.time() - start_time, 2))
			
 
				+            )
			
 
				         return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
			
 
				                            "is_success": 0, "swf_images": str([]),
			
 
				                            "classification": ""})
			
 
				     except Exception as e:
			
 
				-        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
			
 
				-            + str(_type) + " " +
			
 
				-            str(time.time() - start_time))
			
 
				+        log("md5: " + str(_md5) + " "
			
 
				+            + "failed result: [-1] is_success: 0 "
			
 
				+            + str(_type) + " "
			
 
				+            + "None "
			
 
				+            + str(round(time.time() - start_time, 2))
			
 
				+            )
			
 
				         traceback.print_exc()
			
 
				         return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
			
 
				                            "is_success": 0, "swf_images": str([]),
			
@@ -545,6 +618,146 @@ def _convert():
 
				 
			
 
				 
			
 
				 def convert(data):
			
 
				+    """
			
 
				+    接口返回值：
			
 
				+    :return: {"result_html": [str], "result_text": [str],
			
 
				+              "is_success": int, "swf_images": str(list)}
			
 
				+    """
			
 
				+    log("into convert")
			
 
				+    start_time = time.time()
			
 
				+
			
 
				+    # 初始化
			
 
				+    _global._init()
			
 
				+    _global.update({"md5": "1" + "0" * 15})
			
 
				+    set_flask_global()
			
 
				+    # 文件md5
			
 
				+    _md5 = _global.get("md5")
			
 
				+    # 文件类型
			
 
				+    _type = None
			
 
				+    try:
			
 
				+        if not data:
			
 
				+            log("convert no data!")
			
 
				+            raise ConnectionError
			
 
				+
			
 
				+        file_path = data.get("file_path")
			
 
				+        if file_path is None:
			
 
				+            stream = base64.b64decode(data.get("file"))
			
 
				+            log("get bytes from file " + str(time.time() - start_time))
			
 
				+        # 有路径则直接取路径打开文件
			
 
				+        else:
			
 
				+            with open(file_path, "rb") as f:
			
 
				+                stream = f.read()
			
 
				+            log("get bytes from file_path " + str(time.time() - start_time))
			
 
				+
			
 
				+        # 获取真实值
			
 
				+        _type = data.get("type")
			
 
				+        _md5 = get_md5_from_bytes(stream)
			
 
				+        _md5 = _md5[0]
			
 
				+        _global.update({"md5": _md5})
			
 
				+
			
 
				+        # 指定页码范围
			
 
				+        _page_no = data.get('page_no')
			
 
				+
			
 
				+        # 指定timeout
			
 
				+        _timeout = data.get('timeout')
			
 
				+        if _timeout is not None:
			
 
				+            globals().update({"time_out": _timeout})
			
 
				+
			
 
				+        # 是否保留中间文件
			
 
				+        save_middle = data.get('save_middle')
			
 
				+
			
 
				+        # 最终结果截取的最大字节数
			
 
				+        max_bytes = data.get("max_bytes")
			
 
				+
			
 
				+        # 开始转换，并且控制时间
			
 
				+        try:
			
 
				+            text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
			
 
				+                                                        time_out=globals().get('time_out'), save_middle=save_middle)
			
 
				+        except TimeoutError:
			
 
				+            log("convert time out! 300 sec")
			
 
				+            text = [-5]
			
 
				+            swf_images = []
			
 
				+
			
 
				+        # 报错依然成功的
			
 
				+        still_success_code = [-3, -4, -7]
			
 
				+        if judge_error_code(text):
			
 
				+            if judge_error_code(text, still_success_code):
			
 
				+                is_success = 1
			
 
				+            else:
			
 
				+                is_success = 0
			
 
				+            log_convert_result(_md5, text, "", is_success,
			
 
				+                               _type, None, start_time)
			
 
				+            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
			
 
				+                               "is_success": is_success, "swf_images": str(swf_images)})
			
 
				+
			
 
				+        # 结果保存result.html
			
 
				+        text_str = ""
			
 
				+        for t in text:
			
 
				+            text_str += t
			
 
				+        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
			
 
				+
			
 
				+        # 取纯文本
			
 
				+        only_text = []
			
 
				+        for t in text:
			
 
				+            new_t = BeautifulSoup(t, "lxml").get_text()
			
 
				+            new_t = re.sub("\n", "", new_t)
			
 
				+            only_text.append(new_t)
			
 
				+
			
 
				+        # 判断附件类型
			
 
				+        classification = from_atc_interface(' '.join(only_text))
			
 
				+        if judge_error_code(classification):
			
 
				+            classification = [str(classification[0])]
			
 
				+
			
 
				+        # 判断长度，过长截取
			
 
				+        text = cut_str(text, only_text, max_bytes)
			
 
				+        only_text = cut_str(only_text, only_text)
			
 
				+
			
 
				+        if len(only_text) == 0:
			
 
				+            only_text = [""]
			
 
				+
			
 
				+        if only_text[0] == '' and len(only_text) <= 1:
			
 
				+            log_convert_result(_md5, '', '', 1,
			
 
				+                               _type, None, start_time)
			
 
				+        else:
			
 
				+            log_convert_result(_md5, only_text, text, 1,
			
 
				+                               _type, classification, start_time)
			
 
				+        return json.dumps({"result_html": text, "result_text": only_text,
			
 
				+                           "is_success": 1, "swf_images": str(swf_images),
			
 
				+                           "classification": classification})
			
 
				+
			
 
				+    except ConnectionError:
			
 
				+        log_convert_result(_md5, [-2], "", 0,
			
 
				+                           _type, None, start_time)
			
 
				+        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
			
 
				+                           "is_success": 0, "swf_images": str([]),
			
 
				+                           "classification": ""})
			
 
				+    except Exception:
			
 
				+        log_convert_result(_md5, [-1], "", 0,
			
 
				+                           _type, None, start_time)
			
 
				+        traceback.print_exc()
			
 
				+        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
			
 
				+                           "is_success": 0, "swf_images": str([]),
			
 
				+                           "classification": ""})
			
 
				+    finally:
			
 
				+        pass
			
 
				+        # log("finally")
			
 
				+
			
 
				+
			
 
				+def log_convert_result(_md5, only_text, text, is_success, _type, _attach_class, start_time):
			
 
				+    str_list = [
			
 
				+        "md5: " + str(_md5),
			
 
				+        "finished result: " + re.sub(' ', '', str(only_text)[:20]),
			
 
				+        str(len(str(text))),
			
 
				+        "is_success: " + str(is_success),
			
 
				+        str(_type),
			
 
				+        str(_attach_class),
			
 
				+        str(round(time.time()-start_time, 3)),
			
 
				+    ]
			
 
				+    info = ' '.join(str_list)
			
 
				+    log(info)
			
 
				+
			
 
				+
			
 
				+def convert_old_250613(data):
			
 
				     """
			
 
				     接口返回值：
			
 
				     {[str], 1}: 处理成功
			
@@ -558,7 +771,7 @@ def convert(data):
 
				     :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
			
 
				     """
			
 
				     _global._init()
			
 
				-    _global.update({"md5": "1"+"0"*15})
			
 
				+    _global.update({"md5": "1" + "0" * 15})
			
 
				     set_flask_global()
			
 
				 
			
 
				     log("into convert")
			
@@ -584,7 +797,8 @@ def convert(data):
 
				             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
			
 
				             # text, swf_images = origin_unique_temp_file_process(stream, _type)
			
 
				             try:
			
 
				-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
			
 
				+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
			
 
				+                                                            time_out=globals().get('time_out'))
			
 
				             except TimeoutError:
			
 
				                 log("convert time out! 300 sec")
			
 
				                 text = [-5]
			
@@ -592,7 +806,8 @@ def convert(data):
 
				         else:
			
 
				             # Linux 通过装饰器设置整个转换超时时间
			
 
				             try:
			
 
				-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
			
 
				+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
			
 
				+                                                            time_out=globals().get('time_out'))
			
 
				             except TimeoutError:
			
 
				                 log("convert time out! 300 sec")
			
 
				                 text = [-5]
			
@@ -604,11 +819,12 @@ def convert(data):
 
				                 is_success = 1
			
 
				             else:
			
 
				                 is_success = 0
			
 
				-            log("md5: " + str(_md5)
			
 
				-                + " finished result: " + str(text)
			
 
				-                + " is_success: " + str(is_success) + " "
			
 
				+            log("md5: " + str(_md5) + " "
			
 
				+                + "finished result: " + str(text) + " "
			
 
				+                + "is_success: " + str(is_success) + " "
			
 
				                 + str(_type) + " "
			
 
				-                + " " + str(time.time() - start_time))
			
 
				+                + "None "
			
 
				+                + str(round(time.time() - start_time, 2)))
			
 
				             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
			
 
				                     "is_success": is_success, "swf_images": str(swf_images)}
			
 
				 
			
@@ -639,18 +855,19 @@ def convert(data):
 
				             only_text = [""]
			
 
				 
			
 
				         if only_text[0] == '' and len(only_text) <= 1:
			
 
				-            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
			
 
				+            # print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
			
 
				             log("md5: " + str(_md5) + " "
			
 
				-                + " finished result: ['', 0] is_success: 1 "
			
 
				+                + "finished result: ['', 0] is_success: 1 "
			
 
				                 + str(_type) + " "
			
 
				-                + str(time.time() - start_time))
			
 
				+                + "None "
			
 
				+                + str(round(time.time() - start_time, 2)))
			
 
				         else:
			
 
				-            log("md5: " + str(_md5) +
			
 
				-                " finished result: " + str(only_text)[:20] + " "
			
 
				+            log("md5: " + str(_md5) + " "
			
 
				+                + "finished result: " + str(only_text)[:20] + " "
			
 
				                 + str(len(str(text))) + " is_success: 1 "
			
 
				                 + str(_type) + " "
			
 
				                 + str(classification) + " "
			
 
				-                + str(time.time() - start_time))
			
 
				+                + str(round(time.time() - start_time, 2)))
			
 
				 
			
 
				         return {"result_html": text, "result_text": only_text,
			
 
				                 "is_success": 1, "swf_images": str(swf_images),
			
@@ -658,7 +875,7 @@ def convert(data):
 
				 
			
 
				     except ConnectionError:
			
 
				         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
			
 
				-            + str(time.time() - start_time))
			
 
				+            + str(round(time.time() - start_time, 2)))
			
 
				         return {"result_html": ["-2"], "result_text": ["-2"],
			
 
				                 "is_success": 0, "swf_images": str([]),
			
 
				                 "classification": ""}
			
@@ -689,7 +906,7 @@ def convert_old(data, ocr_model, otr_model):
 
				     """
			
 
				     log("into convert")
			
 
				     _global._init()
			
 
				-    _global.update({"md5": "1"+"0"*15})
			
 
				+    _global.update({"md5": "1" + "0" * 15})
			
 
				     # set_flask_global()
			
 
				 
			
 
				     start_time = time.time()
			
@@ -706,7 +923,7 @@ def convert_old(data, ocr_model, otr_model):
 
				         _md5 = get_md5_from_bytes(stream)
			
 
				         _md5 = _md5[0]
			
 
				         _global.update({"md5": _md5})
			
 
				-        log("get bytes from file " + str(time.time()-_time))
			
 
				+        log("get bytes from file " + str(time.time() - _time))
			
 
				 
			
 
				         if get_platform() == "Windows":
			
 
				             try:
			
@@ -730,11 +947,12 @@ def convert_old(data, ocr_model, otr_model):
 
				                 is_success = 1
			
 
				             else:
			
 
				                 is_success = 0
			
 
				-            log("md5: " + str(_md5)
			
 
				-                + " finished result: " + str(text)
			
 
				-                + " is_success: " + str(is_success) + " "
			
 
				+            log("md5: " + str(_md5) + " "
			
 
				+                + "finished result: " + str(text) + " "
			
 
				+                + "is_success: " + str(is_success) + " "
			
 
				                 + str(_type) + " "
			
 
				-                + " " + str(time.time() - start_time))
			
 
				+                + "None "
			
 
				+                + str(round(time.time() - start_time, 2)))
			
 
				             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
			
 
				                     "is_success": is_success, "swf_images": str(swf_images)}
			
 
				 
			
@@ -761,22 +979,24 @@ def convert_old(data, ocr_model, otr_model):
 
				         if only_text[0] == '' and len(only_text) <= 1:
			
 
				             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
			
 
				             log("md5: " + str(_md5) + " "
			
 
				-                + " finished result: ['', 0] is_success: 1 "
			
 
				+                + "finished result: ['', 0] is_success: 1 "
			
 
				                 + str(_type) + " "
			
 
				-                + str(time.time() - start_time))
			
 
				+                + "None "
			
 
				+                + str(round(time.time() - start_time, 2)))
			
 
				         else:
			
 
				-            log("md5: " + str(_md5) +
			
 
				-                " finished result: " + str(only_text)[:20] + " "
			
 
				+            log("md5: " + str(_md5) + " "
			
 
				+                + "finished result: " + str(only_text)[:20] + " "
			
 
				                 + str(len(str(text))) + " is_success: 1 "
			
 
				                 + str(_type) + " "
			
 
				-                + str(time.time() - start_time))
			
 
				+                + "None "
			
 
				+                + str(round(time.time() - start_time, 2)))
			
 
				 
			
 
				         return {"result_html": text, "result_text": only_text,
			
 
				                 "is_success": 1, "swf_images": str(swf_images)}
			
 
				 
			
 
				     except ConnectionError:
			
 
				         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
			
 
				-            + str(time.time() - start_time))
			
 
				+            + str(round(time.time() - start_time, 2)))
			
 
				         return {"result_html": ["-2"], "result_text": ["-2"],
			
 
				                 "is_success": 0, "swf_images": str([])}
			
 
				     except Exception as e:
			
@@ -801,9 +1021,9 @@ def test_more(_dir, process_no=None):
 
				     for p in file_path_list:
			
 
				         if i % 10 == 0:
			
 
				             if process_no is not None:
			
 
				-                print("Process", process_no, i, time.time()-start_time)
			
 
				+                print("Process", process_no, i, time.time() - start_time)
			
 
				             else:
			
 
				-                print("Loop", i, time.time()-start_time)
			
 
				+                print("Loop", i, time.time() - start_time)
			
 
				         test_one(p, from_remote=True)
			
 
				         i += 1
			
 
				 
			
@@ -847,79 +1067,28 @@ def test_duplicate(path_list, process_no=None):
 
				     for i in range(500):
			
 
				         if i % 10 == 0:
			
 
				             if process_no is not None:
			
 
				-                print("Process", process_no, i*len(path_list), time.time()-start_time)
			
 
				+                print("Process", process_no, i * len(path_list), time.time() - start_time)
			
 
				             else:
			
 
				-                print("Loop", i*len(path_list), time.time()-start_time)
			
 
				+                print("Loop", i * len(path_list), time.time() - start_time)
			
 
				         for p in path_list:
			
 
				             test_one(p, from_remote=True)
			
 
				 
			
 
				 
			
 
				-global_type = ""
			
 
				-local_url = "http://127.0.0.1"
			
 
				-if get_platform() == "Windows":
			
 
				-    _path = os.path.abspath(os.path.dirname(__file__))
			
 
				-else:
			
 
				-    _path = "/home/admin"
			
 
				-    if not os.path.exists(_path):
			
 
				-        _path = os.path.dirname(os.path.abspath(__file__))
			
 
				+# global_type = ""
			
 
				+# local_url = "http://127.0.0.1"
			
 
				+# if get_platform() == "Windows":
			
 
				+#     _path = os.path.abspath(os.path.dirname(__file__))
			
 
				+# else:
			
 
				+#     _path = "/home/admin"
			
 
				+#     if not os.path.exists(_path):
			
 
				+#         _path = os.path.dirname(os.path.abspath(__file__))
			
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    # convert interface
			
 
				-    if len(sys.argv) == 2:
			
 
				-        port = int(sys.argv[1])
			
 
				-    else:
			
 
				-        port = 15010
			
 
				-
			
 
				-    globals().update({"md5": "1"+"0"*15})
			
 
				+    port = 15010
			
 
				+    globals().update({"md5": "1" + "0" * 15})
			
 
				     globals().update({"port": str(port)})
			
 
				-    # _global._init()
			
 
				-    # _global.update({"md5": "1"+"0"*15})
			
 
				-    # _global.update({"port": str(port)})
			
 
				-
			
 
				-    # ip = get_intranet_ip()
			
 
				-    # log("my ip"+str(ip))
			
 
				-    # ip = "http://" + ip
			
 
				     ip_port_dict = get_ip_port()
			
 
				-
			
 
				     set_flask_global()
			
 
				+    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
			
 
				 
			
 
				-    if get_platform() == "Windows":
			
 
				-        app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
			
 
				-    else:
			
 
				-        # app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
			
 
				-        app.run(port=15011)
			
 
				-
			
 
				-    # if get_platform() == "Windows":
			
 
				-    #     file_path = "C:/Users/Administrator/Desktop/test_image/error29.png"
			
 
				-    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
			
 
				-    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
			
 
				-    #     # file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
			
 
				-    # else:
			
 
				-    #     file_path = "test1.doc"
			
 
				-    # test_one(file_path, from_remote=True)
			
 
				-
			
 
				-    # if get_platform() == "Windows":
			
 
				-    #     file_dir = "D:/BIDI_DOC/比地_文档/table_images/"
			
 
				-    # else:
			
 
				-    #     file_dir = "../table_images/"
			
 
				-    #
			
 
				-    # for j in range(10):
			
 
				-    #     p = Process(target=test_more, args=(file_dir, j, ))
			
 
				-    #     p.start()
			
 
				-    # p.join()
			
 
				-
			
 
				-    # if get_platform() == "Windows":
			
 
				-    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
			
 
				-    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
			
 
				-    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/11111111.rar"]
			
 
				-    #     file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
			
 
				-    #                       "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"]
			
 
				-    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc"]
			
 
				-    #
			
 
				-    # else:
			
 
				-    #     file_path_list = ["test1.pdf"]
			
 
				-    # for j in range(10):
			
 
				-    #     p = Process(target=test_duplicate, args=(file_path_list, j, ))
			
 
				-    #     p.start()
			
 
				-    # p.join()
			
--- a/format_convert/convert_doc.py
+++ b/format_convert/convert_doc.py
@@ -6,7 +6,7 @@ import sys
 
				 import chardet
			
 
				 from bs4 import BeautifulSoup
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				-from format_convert.convert_tree import _Document, _Sentence, _Page
			
 
				+from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
			
 
				 import logging
			
 
				 import traceback
			
 
				 from format_convert import get_memory_info
			
@@ -35,11 +35,71 @@ def doc2text(path, unique_type_dir):
 
				 class DocConvert:
			
 
				     def __init__(self, path, unique_type_dir):
			
 
				         self._doc = _Document(path)
			
 
				+        self._page = _Page(None, 0)
			
 
				         self.path = path
			
 
				         self.unique_type_dir = unique_type_dir
			
 
				         self.tika_html = None
			
 
				+        print('into DocConvert __init__')
			
 
				 
			
 
				     def convert(self):
			
 
				+        print('into DocConvert convert')
			
 
				+        # 先判断特殊doc文件，可能是html文本
			
 
				+        # is_html_doc = False
			
 
				+        # try:
			
 
				+        #     try:
			
 
				+        #         with open(self.path, 'r') as f:
			
 
				+        #             html_str = f.read()
			
 
				+        #     except UnicodeDecodeError:
			
 
				+        #         with open(self.path, 'r', errors='ignore') as f:
			
 
				+        #             html_str = f.read()
			
 
				+        #     # if re.search('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str):
			
 
				+        #     if len(re.findall('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str)) >= 10:
			
 
				+        #         log('doc as html!')
			
 
				+        #         soup = BeautifulSoup(html_str, 'lxml')
			
 
				+        #         text = soup.text
			
 
				+        #         is_html_doc = True
			
 
				+        # except:
			
 
				+        #     pass
			
 
				+        #
			
 
				+        # if is_html_doc:
			
 
				+        #     self._page = _Page(None, 0)
			
 
				+        #     _sen = _Sentence(text, (0, 0, 0, 0))
			
 
				+        #     self._page.add_child(_sen)
			
 
				+        #     self._doc.add_child(self._page)
			
 
				+
			
 
				+        # 先判断特殊doc文件，可能是html文本
			
 
				+        is_html_doc = self.maybe_html()
			
 
				+
			
 
				+        if not is_html_doc:
			
 
				+            # 调用office格式转换
			
 
				+            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
			
 
				+            if judge_error_code(file_path):
			
 
				+                # office转换失败，调用tika，提取各个类型对象
			
 
				+                try:
			
 
				+                    self.use_tika(self.path)
			
 
				+                except:
			
 
				+                    traceback.print_exc()
			
 
				+                    self._doc.error_code = [-17]
			
 
				+                    log('doc tika failed too')
			
 
				+                return
			
 
				+
			
 
				+            _docx = DocxConvert(file_path, self.unique_type_dir)
			
 
				+            _docx.convert()
			
 
				+            self._doc = _docx._doc
			
 
				+            # if self._doc.error_code is not None:
			
 
				+            #     # docx提取失败，调用tika，提取各个类型对象
			
 
				+            #     print('DocxConvert failed use_tika')
			
 
				+            #     self.use_tika(self.path)
			
 
				+            #     self._doc.error_code = None
			
 
				+            #     # # 调用tika提取
			
 
				+            #     # html = from_tika_interface(self.path)
			
 
				+            #     # if judge_error_code(html):
			
 
				+            #     #     self._doc.error_code = html
			
 
				+            #     # self.tika_html = html
			
 
				+            #     # self._doc.error_code = None
			
 
				+            #     return
			
 
				+
			
 
				+    def maybe_html(self):
			
 
				         # 先判断特殊doc文件，可能是html文本
			
 
				         is_html_doc = False
			
 
				         try:
			
@@ -63,27 +123,39 @@ class DocConvert:
 
				             _sen = _Sentence(text, (0, 0, 0, 0))
			
 
				             self._page.add_child(_sen)
			
 
				             self._doc.add_child(self._page)
			
 
				-        else:
			
 
				-            # 调用office格式转换
			
 
				-            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
			
 
				-            if judge_error_code(file_path):
			
 
				-                # 调用tika提取
			
 
				-                html = from_tika_interface(self.path)
			
 
				-                if judge_error_code(html):
			
 
				-                    self._doc.error_code = html
			
 
				-                self.tika_html = html
			
 
				-                return
			
 
				-            _docx = DocxConvert(file_path, self.unique_type_dir)
			
 
				-            _docx.convert()
			
 
				-            self._doc = _docx._doc
			
 
				-            if self._doc.error_code is not None:
			
 
				-                # 调用tika提取
			
 
				-                html = from_tika_interface(self.path)
			
 
				-                if judge_error_code(html):
			
 
				-                    self._doc.error_code = html
			
 
				-                self.tika_html = html
			
 
				-                self._doc.error_code = None
			
 
				-                return
			
 
				+
			
 
				+        return is_html_doc
			
 
				+
			
 
				+    def use_tika(self, _path):
			
 
				+        # 调用tika提取
			
 
				+        # html = from_tika_interface(self.path)
			
 
				+        # if judge_error_code(html):
			
 
				+        #     self._doc.error_code = html
			
 
				+        # self.tika_html = html
			
 
				+        data = from_tika_interface(_path)
			
 
				+        if judge_error_code(data):
			
 
				+            self._doc.error_code = data
			
 
				+            return
			
 
				+        current_y = 5
			
 
				+        for di, d in enumerate(data):
			
 
				+            data_type, value = d
			
 
				+            bbox = [0, current_y, 20, current_y+10]
			
 
				+            current_y += 20
			
 
				+            if data_type == 'text':
			
 
				+                _sen = _Sentence(value, bbox)
			
 
				+                _sen.combine = False
			
 
				+                self._page.add_child(_sen)
			
 
				+            elif data_type == 'img':
			
 
				+                with open(value, "rb") as f:
			
 
				+                    img = f.read()
			
 
				+                _img = _Image(img, value, bbox)
			
 
				+                _img.is_from_docx = True
			
 
				+                self._page.add_child(_img)
			
 
				+            elif data_type == 'table':
			
 
				+                _table = _Table(value, bbox)
			
 
				+                _table.is_html = True
			
 
				+                self._page.add_child(_table)
			
 
				+        self._doc.add_child(self._page)
			
 
				 
			
 
				     def get_html(self):
			
 
				         try:
			
--- a/format_convert/convert_docx.py
+++ b/format_convert/convert_docx.py
@@ -10,7 +10,8 @@ import xml
 
				 import zipfile
			
 
				 import docx
			
 
				 from bs4 import BeautifulSoup
			
 
				-from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
			
 
				+from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code, \
			
 
				+    get_table_html
			
 
				 from format_convert.wrapt_timeout_decorator import timeout
			
 
				 from format_convert.convert_image import ImageConvert
			
 
				 from format_convert.convert_need_interface import from_tika_interface
			
@@ -313,7 +314,7 @@ def read_xml_order(unique_type_dir, document_xml, numbering_xml, document_xml_re
 
				 
			
 
				 @timeout(50, timeout_exception=TimeoutError)
			
 
				 def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
			
 
				-    def recursion_read_table(table):
			
 
				+    def recursion_read_table(table, show=0):
			
 
				         table_text = '<table border="1">'
			
 
				         tr_index = 0
			
 
				         tr_text_list = []
			
@@ -349,6 +350,7 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
 
				                             if is_merge == "continue":
			
 
				                                 row_span_dict[tc_index][0] += 1
			
 
				                                 tc_index += col_span
			
 
				+                                tc_text_list.append([tc_text, col_span])
			
 
				                                 # 跳过，不增加td
			
 
				                                 continue
			
 
				                                 # col_span_index = 0
			
@@ -403,6 +405,11 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
 
				                 tr_index += 1
			
 
				                 tr_text_list.append(tc_text_list)
			
 
				 
			
 
				+        if show:
			
 
				+            for row in tr_text_list:
			
 
				+                print('row', row)
			
 
				+                print('len(row)', len(row))
			
 
				+
			
 
				         # 替换所有row_span
			
 
				         for key in row_span_dict.keys():
			
 
				             row_span, finish_row_span_flag = row_span_dict.get(key)
			
@@ -420,7 +427,8 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
 
				         for node in body_nodes:
			
 
				             if 'w:tbl' in str(node).split(' '):
			
 
				                 _table = node
			
 
				-                _table_text = recursion_read_table(_table)
			
 
				+                # _table_text = recursion_read_table(_table)
			
 
				+                _table_text = xml_table_to_html(_table, unique_type_dir, numbering_xml, document_xml_rels)
			
 
				                 table_text_list.append(_table_text)
			
 
				         return table_text_list
			
 
				 
			
@@ -430,6 +438,146 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
 
				         return [-1]
			
 
				 
			
 
				 
			
 
				+def xml_table_to_html(table, unique_type_dir, numbering_xml, document_xml_rels, show=0):
			
 
				+    tr_index = 0
			
 
				+    tr_text_list = []
			
 
				+    last_node_level = 0
			
 
				+    num_pr_dict = {}
			
 
				+
			
 
				+    # 直接子节点用child表示，所有子节点用all表示
			
 
				+    for table_child in table.childNodes:
			
 
				+        if 'w:tr' in str(table_child):
			
 
				+            tr = table_child
			
 
				+            tr_child_nodes = tr.childNodes
			
 
				+            tc_index = 0
			
 
				+            tc_text_list = []
			
 
				+            for tr_child in tr_child_nodes:
			
 
				+                if 'w:tc' in str(tr_child).split(' '):
			
 
				+                    tc_text = ""
			
 
				+                    tc = tr_child
			
 
				+                    # 获取一格占多少列，相当于colspan
			
 
				+                    col_span = tc.getElementsByTagName("w:gridSpan")
			
 
				+                    if col_span:
			
 
				+                        col_span = int(col_span[0].getAttribute("w:val"))
			
 
				+                    else:
			
 
				+                        col_span = 1
			
 
				+                    # 获取是否是合并单元格的下一个空单元格，相当于rowspan
			
 
				+                    is_merge = tc.getElementsByTagName("w:vMerge")
			
 
				+                    if is_merge:
			
 
				+                        is_merge = is_merge[0].getAttribute("w:val")
			
 
				+                        if is_merge == "continue":
			
 
				+                            tc_index += col_span
			
 
				+                            tc_text = '@continue@'
			
 
				+                            tc_text_list.append([tc_text, col_span])
			
 
				+                            # 跳过，不增加td
			
 
				+                            continue
			
 
				+
			
 
				+                    # 放入文本
			
 
				+                    tc_child_nodes = tc.childNodes
			
 
				+                    for tc_child in tc_child_nodes:
			
 
				+                        # 处理嵌套在tc中的表格
			
 
				+                        if 'w:tbl' in str(tc_child).split(' '):
			
 
				+                            tc_text += xml_table_to_html(tc_child, unique_type_dir, numbering_xml, document_xml_rels)
			
 
				+                        # 处理编号
			
 
				+                        if 'w:p' in str(tc_child).split(' '):
			
 
				+                            _t_list, _, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
			
 
				+                                                                                   tc_child,
			
 
				+                                                                                   last_node_level,
			
 
				+                                                                                   num_pr_dict,
			
 
				+                                                                                   numbering_xml,
			
 
				+                                                                                   document_xml_rels)
			
 
				+                            tc_text += ''.join(_t_list)
			
 
				+                    # 结束该tc
			
 
				+                    tc_index += col_span
			
 
				+                    tc_text_list.append([tc_text, col_span])
			
 
				+            # 结束该tr
			
 
				+            tr_index += 1
			
 
				+            tr_text_list.append(tc_text_list)
			
 
				+
			
 
				+    if show:
			
 
				+        for row in tr_text_list:
			
 
				+            print('row', row)
			
 
				+            print('len(row)', len(row))
			
 
				+
			
 
				+    table_html = row_list_to_table(tr_text_list)
			
 
				+    return table_html
			
 
				+
			
 
				+
			
 
				+def row_list_to_table(row_list, show=0):
			
 
				+    if show:
			
 
				+        print('='*50)
			
 
				+
			
 
				+    # 复制合并列
			
 
				+    new_row_list = []
			
 
				+    for row in row_list:
			
 
				+        new_row = []
			
 
				+        for col, col_span in row:
			
 
				+            new_row += [[col, col_span]]
			
 
				+            if col_span > 1:
			
 
				+                new_row += [[col, 0]] * (col_span - 1)
			
 
				+        new_row_list.append(new_row)
			
 
				+    row_list = new_row_list
			
 
				+
			
 
				+    if show:
			
 
				+        for row in row_list:
			
 
				+            print('copy row', row)
			
 
				+
			
 
				+    # 计算是不是每行都有相等列数
			
 
				+    row_cnt_list = []
			
 
				+    for row in row_list:
			
 
				+        row_cnt_list.append(len(row))
			
 
				+
			
 
				+    if len(set(row_cnt_list)) != 1:
			
 
				+        log('表格有列数不同，直接返回text' + str(row_cnt_list))
			
 
				+        # 直接返回所有col的text
			
 
				+        text = ''
			
 
				+        for row in row_list:
			
 
				+            for col, col_span in row:
			
 
				+                text += col
			
 
				+        return text
			
 
				+
			
 
				+    new_row_list = []
			
 
				+    for ri, row in enumerate(row_list):
			
 
				+        new_row = []
			
 
				+        for ci, col in enumerate(row):
			
 
				+            col, col_span = col
			
 
				+            row_span = 1
			
 
				+            # 判断下面行同列有没有需合并的
			
 
				+            for ri2 in range(ri+1, len(row_list)):
			
 
				+                col2, col_span2 = row_list[ri2][ci]
			
 
				+                if col2 == '@continue@':
			
 
				+                    row_span += 1
			
 
				+                else:
			
 
				+                    break
			
 
				+
			
 
				+            # 需跳过的列
			
 
				+            if col == '@continue@' or col_span == 0:
			
 
				+                delete = 1
			
 
				+            else:
			
 
				+                delete = 0
			
 
				+
			
 
				+            col_dict = {
			
 
				+                'text': col,
			
 
				+                'rowspan': row_span,
			
 
				+                'columnspan': col_span,
			
 
				+                'delete': delete,
			
 
				+            }
			
 
				+            new_row.append(col_dict)
			
 
				+        new_row_list.append(new_row)
			
 
				+
			
 
				+    if show:
			
 
				+        for new_row in new_row_list:
			
 
				+            print('new_row', new_row)
			
 
				+
			
 
				+    table_html = get_table_html(new_row_list)
			
 
				+
			
 
				+    # soup = BeautifulSoup(table_html, 'lxml')
			
 
				+    # print(soup.prettify())
			
 
				+    if show:
			
 
				+        print('-' * 50)
			
 
				+    return table_html
			
 
				+
			
 
				+
			
 
				 @timeout(25, timeout_exception=TimeoutError)
			
 
				 def parse_xml(path):
			
 
				     # 解析xml
			
@@ -449,6 +597,7 @@ def parse_xml2(path):
 
				 class DocxConvert:
			
 
				     def __init__(self, path, unique_type_dir):
			
 
				         self._doc = _Document(path)
			
 
				+        self._page = _Page(None, 0)
			
 
				         self.path = path
			
 
				         self.unique_type_dir = unique_type_dir
			
 
				 
			
@@ -497,8 +646,6 @@ class DocxConvert:
 
				             self._doc.error_code = [-3]
			
 
				 
			
 
				     def convert(self):
			
 
				-        self._page = _Page(None, 0)
			
 
				-
			
 
				         # 先判断特殊doc文件，可能是html文本
			
 
				         is_html_doc = False
			
 
				         try:
			
@@ -630,23 +777,62 @@ class DocxConvert:
 
				     def get_doc_object(self):
			
 
				         return self._doc
			
 
				 
			
 
				+    def use_tika(self, _path):
			
 
				+        # 调用tika提取
			
 
				+        # html = from_tika_interface(self.path)
			
 
				+        # if judge_error_code(html):
			
 
				+        #     self._doc.error_code = html
			
 
				+        # self.tika_html = html
			
 
				+        data = from_tika_interface(_path)
			
 
				+        if judge_error_code(data):
			
 
				+            self._doc.error_code = data
			
 
				+            return
			
 
				+        current_y = 5
			
 
				+        for di, d in enumerate(data):
			
 
				+            data_type, value = d
			
 
				+            bbox = [0, current_y, 20, current_y+10]
			
 
				+            current_y += 20
			
 
				+            if data_type == 'text':
			
 
				+                _sen = _Sentence(value, bbox)
			
 
				+                _sen.combine = False
			
 
				+                self._page.add_child(_sen)
			
 
				+            elif data_type == 'img':
			
 
				+                with open(value, "rb") as f:
			
 
				+                    img = f.read()
			
 
				+                _img = _Image(img, value, bbox)
			
 
				+                _img.is_from_docx = True
			
 
				+                self._page.add_child(_img)
			
 
				+            elif data_type == 'table':
			
 
				+                _table = _Table(value, bbox)
			
 
				+                _table.is_html = True
			
 
				+                self._page.add_child(_table)
			
 
				+        self._doc.add_child(self._page)
			
 
				+
			
 
				     def get_html(self):
			
 
				         if self._doc.error_code is not None:
			
 
				             return self._doc.error_code
			
 
				         try:
			
 
				+            # raise
			
 
				             self.convert()
			
 
				         except:
			
 
				             traceback.print_exc()
			
 
				             self._doc.error_code = [-1]
			
 
				         # log('docx error code ' + str(self._doc.error_code))
			
 
				         if self._doc.error_code is not None:
			
 
				-            # 调用tika提取
			
 
				-            html = from_tika_interface(self.path)
			
 
				-            if judge_error_code(html):
			
 
				-                self._doc.error_code = html
			
 
				-                return self._doc.error_code
			
 
				-            else:
			
 
				-                return [html]
			
 
				+            # # 调用tika提取
			
 
				+            # html = from_tika_interface(self.path)
			
 
				+            # if judge_error_code(html):
			
 
				+            #     self._doc.error_code = html
			
 
				+            #     return self._doc.error_code
			
 
				+            # else:
			
 
				+            #     return [html]
			
 
				+            try:
			
 
				+                self.use_tika(self.path)
			
 
				+                self._doc.error_code = None
			
 
				+            except:
			
 
				+                traceback.print_exc()
			
 
				+                log('docx tika failed too')
			
 
				+                self._doc.error_code = [-17]
			
 
				         return self._doc.get_html()
			
 
				 
			
 
				 
			
@@ -791,9 +977,10 @@ class DocxConvertNew:
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    c = DocxConvert("C:/Users/Administrator/Downloads/dsdsd.docx", "C:/Users/Administrator/Downloads/1/")
			
 
				-    print(c.get_html())
			
 
				-
			
 
				-    # c = DocxConvertNew()
			
 
				-    # # c.read_docx(r'C:\Users\Administrator\Desktop\test_doc\error14.docx')
			
 
				-    # c.read_docx(r'C:/Users/Administrator/Downloads/dsdsd.docx')
			
 
				+    _p = r'C:/Users/Administrator/Downloads/1723004790329.docx'
			
 
				+    # _p = "C:/Users/Administrator/Desktop/test_doc/error14.docx"
			
 
				+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
			
 
				+    c = DocxConvert(_p, save_dir)
			
 
				+    _html = c.get_html()
			
 
				+    with open('../result.html', 'w', encoding='utf-8') as f:
			
 
				+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + str(_html[0]))
			
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -21,7 +21,7 @@ from format_convert.utils import judge_error_code, add_div, LineTable, get_table
 
				 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
			
 
				     from_idc_interface, from_isr_interface
			
 
				 from format_convert.table_correct import get_rotated_image
			
 
				-from botr.extract_table import get_table
			
 
				+from botr.extract_table import get_table, get_b_table_by_blank_colon
			
 
				 
			
 
				 
			
 
				 def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
			
@@ -66,7 +66,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				     def merge_textbox(textbox_list, in_objs):
			
 
				         delete_obj = []
			
 
				         threshold = 5
			
 
				-        textbox_list.sort(key=lambda x:x.bbox[0])
			
 
				+        textbox_list.sort(key=lambda x: x.bbox[0])
			
 
				         for k in range(len(textbox_list)):
			
 
				             tb1 = textbox_list[k]
			
 
				             if tb1 not in in_objs and tb1 not in delete_obj:
			
@@ -74,6 +74,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				                     tb2 = textbox_list[m]
			
 
				                     if tb2 in in_objs:
			
 
				                         continue
			
 
				+                    # print('tb1 tb2', tb1, tb2)
			
 
				                     if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
			
 
				                             and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
			
 
				                         if tb1.bbox[0] <= tb2.bbox[0]:
			
@@ -88,9 +89,9 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				                 textbox_list.remove(_obj)
			
 
				         return textbox_list
			
 
				 
			
 
				-    def resize_process(_image_np):
			
 
				+    def resize_process(_image_np, threshold=2048):
			
 
				+    # def resize_process(_image_np, threshold=1280):
			
 
				         # 整体分辨率限制
			
 
				-        threshold = 2048
			
 
				         if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
			
 
				             h, w = get_best_predict_size2(_image_np, threshold=threshold)
			
 
				             log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
			
@@ -169,14 +170,24 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				         log("isr total time "+str(time.time()-_isr_time))
			
 
				         return _image_np
			
 
				 
			
 
				-    def ocr_process(_image_np, _threshold=2048):
			
 
				+    # def ocr_process(_image_np, _threshold=2048):
			
 
				+    def ocr_process(_image_np, _threshold=1080):
			
 
				         log("ocr_process image shape " + str(_image_np.shape))
			
 
				 
			
 
				+        # 过小直接返回
			
 
				+        if _image_np.shape[0] <= 10 or _image_np.shape[1] <= 10:
			
 
				+            return [], []
			
 
				+        if _image_np.shape[0] < 50 and _image_np.shape[1] / _image_np.shape[0] > 20:
			
 
				+            return [], []
			
 
				+        if _image_np.shape[1] < 50 and _image_np.shape[0] / _image_np.shape[1] > 20:
			
 
				+            return [], []
			
 
				+
			
 
				         # ocr图片过大内存溢出，需resize
			
 
				         # 大图按比例缩小，小图维持不变；若统一拉伸成固定大小如1024会爆显存
			
 
				         ratio = (1, 1)
			
 
				         if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
			
 
				-            best_h, best_w = get_best_predict_size2(_image_np, _threshold)
			
 
				+            # best_h, best_w = get_best_predict_size2(_image_np, _threshold)
			
 
				+            best_h, best_w = get_best_predict_size_by_area(_image_np, _threshold)
			
 
				             _image_np = pil_resize(_image_np, best_h, best_w)
			
 
				             log("ocr_process image resize " + str(_image_np.shape))
			
 
				             ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
			
@@ -189,7 +200,13 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				 
			
 
				         # 调用ocr模型接口
			
 
				         image_bytes = np2bytes(_image_np)
			
 
				-        text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
			
 
				+        result = from_ocr_interface(image_bytes, is_table=1)
			
 
				+        # print('from_ocr_interface result ', result)
			
 
				+        if len(result) != 2:
			
 
				+            return result, result
			
 
				+
			
 
				+        text_list, bbox_list = result
			
 
				+        # text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
			
 
				         if judge_error_code(text_list):
			
 
				             return text_list, text_list
			
 
				 
			
@@ -264,6 +281,13 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				 
			
 
				     def botr_process(_image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
			
 
				                      from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
			
 
				+
			
 
				+        temp_list = []
			
 
				+        for _table2 in table_list2:
			
 
				+            _table2 = _Table(_table2["table"], _table2["bbox"])
			
 
				+            temp_list.append(_table2)
			
 
				+        table_list2 = temp_list
			
 
				+
			
 
				         if from_pdf:
			
 
				             # 交叉验证 ocr结果与pdf obj，暂时使用pdf提取的
			
 
				             h_ratio = _image_np.shape[0] / pdf_layout_size[1]
			
@@ -300,14 +324,55 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				             box_list2 = pdf_box_list
			
 
				             text_box_list2 = pdf_text_box_list
			
 
				 
			
 
				-        _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, table_list2, text_list2, box_list2, text_box_list2)
			
 
				-
			
 
				-        # 保存无边框表格文件
			
 
				-        if _table_list:
			
 
				+            _b_table_list = []
			
 
				+            _not_b_table_list = []
			
 
				+        else:
			
 
				+            # 无边框新规则，补充添加 2505015
			
 
				+            # 根据text规律，判断该页是否可能有无边框表格
			
 
				             try:
			
 
				-                save_b_table(_image_np, text_box_list2, from_pdf)
			
 
				+                _b_table_list, _not_b_table_list = get_b_table_by_blank_colon(text_box_list2, table_list2, (
			
 
				+                0, 0, _image_np.shape[1], _image_np.shape[0]), _image_np)
			
 
				             except:
			
 
				-                pass
			
 
				+                traceback.print_exc()
			
 
				+                return [-23], [], []
			
 
				+
			
 
				+            # print('_b_table_list111', _b_table_list)
			
 
				+            if _b_table_list:
			
 
				+                temp_list = []
			
 
				+                for _b_table in _b_table_list:
			
 
				+                    _b_table = _Table(_b_table[0], _b_table[1])
			
 
				+                    # table_list2 += [_b_table]
			
 
				+                    temp_list.append(_b_table)
			
 
				+                _b_table_list = temp_list
			
 
				+            if _not_b_table_list:
			
 
				+                temp_list = []
			
 
				+                for _b_table in _not_b_table_list:
			
 
				+                    _b_table = _Table(_b_table[0], _b_table[1])
			
 
				+                    temp_list.append(_b_table)
			
 
				+                _not_b_table_list = temp_list
			
 
				+
			
 
				+        ignore_table_list = table_list2 + _b_table_list + _not_b_table_list
			
 
				+        # yolo检测出的表格，忽略两列的，因为已经补充了两列的新规则 250529
			
 
				+        _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, ignore_table_list, text_list2, box_list2, text_box_list2, from_pdf=from_pdf)
			
 
				+        # print('_table_list', _table_list)
			
 
				+        # print('_b_table_list222', _b_table_list)
			
 
				+
			
 
				+        # 无边框新规则，补充添加 2505015
			
 
				+        _table_list = [_Table(x.get('table'), x.get('bbox')) for x in _table_list]
			
 
				+        _table_list += _b_table_list
			
 
				+        for _b_table in _b_table_list:
			
 
				+            for _text_box in text_box_list2:
			
 
				+                if _b_table.bbox[1] <= _text_box.bbox[1] <= _text_box.bbox[3] <= _b_table.bbox[3]:
			
 
				+                    # print('add _obj_in_table_list 250515', _text_box)
			
 
				+                    _obj_in_table_list.append(_text_box)
			
 
				+        # print('_b_table_list233', _table_list)
			
 
				+
			
 
				+        # 保存无边框表格文件
			
 
				+        # if _table_list:
			
 
				+        #     try:
			
 
				+        #         save_b_table(_image_np, text_box_list2, from_pdf)
			
 
				+        #     except:
			
 
				+        #         pass
			
 
				 
			
 
				         # print('_text_box_list', _text_box_list)
			
 
				         # print('_table_list', _table_list)
			
@@ -496,7 +561,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				             else:
			
 
				                 # 根据index拆开图片，重新ocr
			
 
				                 split_index_list.insert(0, 0)
			
 
				-                print('split_index_list1', split_index_list)
			
 
				+                # print('split_index_list1', split_index_list)
			
 
				                 for _i, index in enumerate(split_index_list):
			
 
				                     if _i == len(split_index_list) - 1:
			
 
				                         split_image_np = sub_image_np[:, index:, :]
			
@@ -602,12 +667,12 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				                 # 生成TextBox对象
			
 
				                 text_box_list = get_text_box_obj(text_list, box_list)
			
 
				                 # for t in text_box_list:
			
 
				-                #     print('text_box0', t.get_text())
			
 
				+                #     print('text_box0', t)
			
 
				 
			
 
				                 # 表格生成
			
 
				                 text_box_list, table_list, obj_in_table_list = table_process(line_list, text_box_list, image_np)
			
 
				                 # for t in text_box_list:
			
 
				-                #     print('text_box1', t.get_text())
			
 
				+                #     print('text_box1', t)
			
 
				                 # print('table_list', table_list)
			
 
				                 # for t in obj_in_table_list:
			
 
				                 #     print('obj_text_box2', t.get_text())
			
@@ -625,10 +690,20 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				                                                                                 pdf_layout_size,
			
 
				                                                                                 )
			
 
				                 log('botr process cost: ' + str(time.time()-start_time))
			
 
				+                if judge_error_code(text_box_list):
			
 
				+                    return text_box_list
			
 
				+
			
 
				+                # print('b_table_list333', b_table_list)
			
 
				+                obj_in_table_list.update(set(b_obj_in_table_list))
			
 
				+                # for t in text_box_list:
			
 
				+                #     print('text_box2', t)
			
 
				 
			
 
				                 # 合并非表格的同一行TextBox
			
 
				                 text_box_list = merge_textbox(text_box_list, obj_in_table_list)
			
 
				 
			
 
				+                # for t in text_box_list:
			
 
				+                #     print('text_box3', t)
			
 
				+                # print('table_list, b_table_list', table_list, b_table_list)
			
 
				                 table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
			
 
				 
			
 
				             if reverse_flag:
			
@@ -649,16 +724,21 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				             _add_y = 0
			
 
				             for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
			
 
				                 obj_list = []
			
 
				+                # print('obj_in_table_list', obj_in_table_list)
			
 
				                 for table in table_list:
			
 
				-                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
			
 
				+                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y,
			
 
				+                                   table["bbox"][2], table["bbox"][3] + _add_y]
			
 
				                     _table = _Table(table["table"], _table_bbox)
			
 
				+                    # print('_table.bbo2x', _table.bbox)
			
 
				                     obj_list.append(_table)
			
 
				                 for table in b_table_list:
			
 
				-                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
			
 
				-                    _table = _Table(table["table"], _table_bbox)
			
 
				-                    obj_list.append(_table)
			
 
				+                    # _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
			
 
				+                    # _table = _Table(table["table"], _table_bbox)
			
 
				+                    # print('table.bbo1x', table.bbox)
			
 
				+                    obj_list.append(table)
			
 
				                 for text_box in text_box_list:
			
 
				                     if text_box not in obj_in_table_list:
			
 
				+                        # print('text_box',  text_box)
			
 
				                         text_box.bbox[1] += _add_y
			
 
				                         obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
			
 
				 
			
@@ -707,6 +787,8 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				                                                                         pdf_layout_size,
			
 
				                                                                         )
			
 
				             log('botr process cost: ' + str(time.time()-start_time))
			
 
				+            if judge_error_code(text_box_list):
			
 
				+                return text_box_list
			
 
				 
			
 
				             # 合并非表格的同一行TextBox
			
 
				             text_box_list = merge_textbox(text_box_list, obj_in_table_list)
			
@@ -715,8 +797,10 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				             obj_list = []
			
 
				             # print('table_list', table_list)
			
 
				             for table in table_list:
			
 
				-                _table = _Table(table["table"], table["bbox"])
			
 
				-                obj_list.append(_table)
			
 
				+                # print('type(table)', type(table))
			
 
				+                # _table = _Table(table["table"], table["bbox"])
			
 
				+                # print('table.bbox', table.bbox)
			
 
				+                obj_list.append(table)
			
 
				             for text_box in text_box_list:
			
 
				                 if text_box not in obj_in_table_list:
			
 
				                     obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
			
@@ -732,6 +816,690 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				         return [-1]
			
 
				 
			
 
				 
			
 
				+# class ImageProcess:
			
 
				+#     def __init__(self, image_np, image_path, is_from_pdf=False, is_from_docx=False,
			
 
				+#                  b_table_from_text=False, pdf_obj_list=[], pdf_layout_size=(),
			
 
				+#                  is_reverse=False):
			
 
				+#
			
 
				+#         self.image_np = image_np
			
 
				+#         self.image_path = image_path
			
 
				+#         self.is_from_pdf = is_from_pdf
			
 
				+#         self.is_from_docx = is_from_docx
			
 
				+#         self.b_table_from_text = b_table_from_text
			
 
				+#         self.pdf_obj_list = pdf_obj_list
			
 
				+#         self.pdf_layout_size = pdf_layout_size
			
 
				+#         self.is_reverse = is_reverse
			
 
				+#
			
 
				+#     def merge_textbox(self, textbox_list, in_objs):
			
 
				+#         delete_obj = []
			
 
				+#         threshold = 5
			
 
				+#         textbox_list.sort(key=lambda x:x.bbox[0])
			
 
				+#         for k in range(len(textbox_list)):
			
 
				+#             tb1 = textbox_list[k]
			
 
				+#             if tb1 not in in_objs and tb1 not in delete_obj:
			
 
				+#                 for m in range(k+1, len(textbox_list)):
			
 
				+#                     tb2 = textbox_list[m]
			
 
				+#                     if tb2 in in_objs:
			
 
				+#                         continue
			
 
				+#                     if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
			
 
				+#                             and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
			
 
				+#                         if tb1.bbox[0] <= tb2.bbox[0]:
			
 
				+#                             tb1.text = tb1.text + tb2.text
			
 
				+#                         else:
			
 
				+#                             tb1.text = tb2.text + tb1.text
			
 
				+#                         tb1.bbox[0] = min(tb1.bbox[0], tb2.bbox[0])
			
 
				+#                         tb1.bbox[2] = max(tb1.bbox[2], tb2.bbox[2])
			
 
				+#                         delete_obj.append(tb2)
			
 
				+#         for _obj in delete_obj:
			
 
				+#             if _obj in textbox_list:
			
 
				+#                 textbox_list.remove(_obj)
			
 
				+#         return textbox_list
			
 
				+#
			
 
				+#     def resize_process(self, _image_np):
			
 
				+#         # 整体分辨率限制
			
 
				+#         threshold = 2048
			
 
				+#         if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
			
 
				+#             h, w = get_best_predict_size2(_image_np, threshold=threshold)
			
 
				+#             log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
			
 
				+#             _image_np = pil_resize(_image_np, h, w)
			
 
				+#         return _image_np
			
 
				+#
			
 
				+#     def idc_process(self, _image_np, return_angle=False):
			
 
				+#         # 图片倾斜校正，写入原来的图片路径
			
 
				+#         # print("image_process", image_path)
			
 
				+#         # g_r_i = get_rotated_image(_image_np, image_path)
			
 
				+#         # if judge_error_code(g_r_i):
			
 
				+#         #     if is_from_docx:
			
 
				+#         #         return []
			
 
				+#         #     else:
			
 
				+#         #         return g_r_i
			
 
				+#         # _image_np = cv2.imread(image_path)
			
 
				+#         # if _image_np is None:
			
 
				+#         #     return []
			
 
				+#         # return _image_np
			
 
				+#
			
 
				+#         # if _image_np is None:
			
 
				+#         #     return []
			
 
				+#
			
 
				+#         # idc模型实现图片倾斜校正
			
 
				+#         h, w = get_best_predict_size2(_image_np, 1080)
			
 
				+#         image_resize = pil_resize(_image_np, h, w)
			
 
				+#         # image_resize_path = image_path.split(".")[0] + "_resize_idc." + image_path.split(".")[-1]
			
 
				+#         # cv2.imwrite(image_resize_path, image_resize)
			
 
				+#
			
 
				+#         # with open(image_resize_path, "rb") as f:
			
 
				+#         #     image_bytes = f.read()
			
 
				+#         image_bytes = np2bytes(image_resize)
			
 
				+#         angle = from_idc_interface(image_bytes)
			
 
				+#         log('idc_process angle ' + str(angle))
			
 
				+#         if judge_error_code(angle):
			
 
				+#             if return_angle:
			
 
				+#                 if self.is_from_docx:
			
 
				+#                     return [], []
			
 
				+#                 else:
			
 
				+#                     return angle, angle
			
 
				+#             else:
			
 
				+#                 if self.is_from_docx:
			
 
				+#                     return []
			
 
				+#                 else:
			
 
				+#                     return angle
			
 
				+#         # 根据角度旋转
			
 
				+#         # _image_pil = Image.fromarray(_image_np)
			
 
				+#         # _image_np = np.array(_image_pil.rotate(angle, expand=1))
			
 
				+#         _image_np = image_rotate(_image_np, angle)
			
 
				+#
			
 
				+#         # 写入
			
 
				+#         # idc_path = image_path.split(".")[0] + "_idc." + image_path.split(".")[-1]
			
 
				+#         # cv2.imwrite(idc_path, image_np)
			
 
				+#         if return_angle:
			
 
				+#             return _image_np, angle
			
 
				+#         return _image_np
			
 
				+#
			
 
				+#     def isr_process(self, _image_np):
			
 
				+#         log("isr_process image shape " + str(_image_np.shape))
			
 
				+#         image_np_copy = copy.deepcopy(_image_np)
			
 
				+#         # isr模型去除印章
			
 
				+#         _isr_time = time.time()
			
 
				+#         if count_red_pixel(_image_np):
			
 
				+#             # 红色像素达到一定值才过模型
			
 
				+#             image_bytes = np2bytes(_image_np)
			
 
				+#             _image_np = from_isr_interface(image_bytes)
			
 
				+#             if judge_error_code(_image_np):
			
 
				+#                 if self.is_from_docx:
			
 
				+#                     return []
			
 
				+#                 else:
			
 
				+#                     return _image_np
			
 
				+#             # [1]代表检测不到印章，直接返回
			
 
				+#             if isinstance(_image_np, list) and _image_np == [1]:
			
 
				+#                 log("no seals detected!")
			
 
				+#                 _image_np = image_np_copy
			
 
				+#         log("isr total time "+str(time.time()-_isr_time))
			
 
				+#         return _image_np
			
 
				+#
			
 
				+#     def ocr_process(self, _image_np, _threshold=2048):
			
 
				+#         log("ocr_process image shape " + str(_image_np.shape))
			
 
				+#
			
 
				+#         # ocr图片过大内存溢出，需resize
			
 
				+#         # 大图按比例缩小，小图维持不变；若统一拉伸成固定大小如1024会爆显存
			
 
				+#         ratio = (1, 1)
			
 
				+#         h, w = _image_np.shape[:2]
			
 
				+#         if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
			
 
				+#             best_h, best_w = get_best_predict_size2(_image_np, _threshold)
			
 
				+#             _image_np = pil_resize(_image_np, best_h, best_w)
			
 
				+#             log("ocr_process image resize " + str(_image_np.shape))
			
 
				+#             ratio = (h/best_h, w/best_w)
			
 
				+#
			
 
				+#         # 大图片ocr加锁，防止爆显存
			
 
				+#         # if _image_np.shape[0] >= 1024 and _image_np.shape[1] >= 1024:
			
 
				+#         #     file_lock = True
			
 
				+#         # else:
			
 
				+#         #     file_lock = False
			
 
				+#
			
 
				+#         # 调用ocr模型接口
			
 
				+#         image_bytes = np2bytes(_image_np)
			
 
				+#         text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
			
 
				+#         if judge_error_code(text_list):
			
 
				+#             return text_list, text_list
			
 
				+#
			
 
				+#         for i in range(len(bbox_list)):
			
 
				+#             point = bbox_list[i]
			
 
				+#             bbox_list[i] = [[int(point[0][0]*ratio[0]), int(point[0][1]*ratio[1])],
			
 
				+#                             [int(point[1][0]*ratio[0]), int(point[1][1]*ratio[1])],
			
 
				+#                             [int(point[2][0]*ratio[0]), int(point[2][1]*ratio[1])],
			
 
				+#                             [int(point[3][0]*ratio[0]), int(point[3][1]*ratio[1])]]
			
 
				+#
			
 
				+#         # 去除水印字 根据识别是否为矩形框
			
 
				+#         temp_text_list = []
			
 
				+#         temp_bbox_list = []
			
 
				+#         water_mark_dict = {}
			
 
				+#         for i in range(len(bbox_list)):
			
 
				+#             bbox = bbox_list[i]
			
 
				+#             text = text_list[i]
			
 
				+#             if len(re.findall('[\u4e00-\u9fa5]', text)) == len(text):
			
 
				+#                 if (abs(bbox[0][1] - bbox[1][1]) <= 2 and abs(bbox[2][1] - bbox[3][1]) <= 2) \
			
 
				+#                         or (abs(bbox[0][0] - bbox[3][0]) <= 4 and abs(bbox[2][0] - bbox[1][0]) <= 4):
			
 
				+#                     temp_text_list.append(text)
			
 
				+#                     temp_bbox_list.append(bbox)
			
 
				+#                 else:
			
 
				+#                     if text in water_mark_dict.keys():
			
 
				+#                         water_mark_dict[text] += [bbox]
			
 
				+#                     else:
			
 
				+#                         water_mark_dict[text] = [bbox]
			
 
				+#             else:
			
 
				+#                 temp_text_list.append(text)
			
 
				+#                 temp_bbox_list.append(bbox)
			
 
				+#
			
 
				+#         # 数量多的才算水印
			
 
				+#         for text in water_mark_dict.keys():
			
 
				+#             bbox_list = water_mark_dict.get(text)
			
 
				+#             if len(bbox_list) < 3:
			
 
				+#                 for bbox in bbox_list:
			
 
				+#                     temp_text_list.append(text)
			
 
				+#                     temp_bbox_list.append(bbox)
			
 
				+#
			
 
				+#         text_list = temp_text_list
			
 
				+#         bbox_list = temp_bbox_list
			
 
				+#         return text_list, bbox_list
			
 
				+#
			
 
				+#     def otr_process(self, _image_np):
			
 
				+#         log("otr_process image shape " + str(_image_np.shape))
			
 
				+#         # otr模型识别表格，需要图片resize成模型所需大小, 写入另一个路径
			
 
				+#         best_h, best_w = get_best_predict_size(_image_np)
			
 
				+#         image_resize = pil_resize(_image_np, best_h, best_w)
			
 
				+#         # image_resize_path = image_path.split(".")[0] + "_resize_otr." + image_path.split(".")[-1]
			
 
				+#         # cv2.imwrite(image_resize_path, image_resize)
			
 
				+#
			
 
				+#         # 调用otr模型接口
			
 
				+#         # with open(image_resize_path, "rb") as f:
			
 
				+#         #     image_bytes = f.read()
			
 
				+#         image_bytes = np2bytes(image_resize)
			
 
				+#         list_line = from_otr_interface(image_bytes, self.is_from_pdf)
			
 
				+#         if judge_error_code(list_line):
			
 
				+#             if self.is_from_docx:
			
 
				+#                 return []
			
 
				+#             else:
			
 
				+#                 return list_line
			
 
				+#
			
 
				+#         # otr resize后得到的bbox根据比例还原
			
 
				+#         start_time = time.time()
			
 
				+#         ratio = (_image_np.shape[0]/best_h, _image_np.shape[1]/best_w)
			
 
				+#         for i in range(len(list_line)):
			
 
				+#             point = list_line[i]
			
 
				+#             list_line[i] = [int(point[0]*ratio[1]), int(point[1]*ratio[0]),
			
 
				+#                             int(point[2]*ratio[1]), int(point[3]*ratio[0])]
			
 
				+#         log("otr resize bbox recover " + str(time.time()-start_time))
			
 
				+#         return list_line
			
 
				+#
			
 
				+#     def botr_process(self, _image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
			
 
				+#                      from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
			
 
				+#         if from_pdf:
			
 
				+#             # 交叉验证 ocr结果与pdf obj，暂时使用pdf提取的
			
 
				+#             h_ratio = _image_np.shape[0] / pdf_layout_size[1]
			
 
				+#             w_ratio = _image_np.shape[1] / pdf_layout_size[0]
			
 
				+#             pdf_text_list = []
			
 
				+#             pdf_box_list = []
			
 
				+#             for obj in pdf_obj_list:
			
 
				+#                 if obj.get_text() in ["", " "]:
			
 
				+#                     continue
			
 
				+#
			
 
				+#                 # pdf坐标是上下颠倒的
			
 
				+#                 # obj.bbox = (obj.bbox[0], pdf_layout_size[1]-obj.bbox[3],
			
 
				+#                 #             obj.bbox[2], pdf_layout_size[1]-obj.bbox[1])
			
 
				+#
			
 
				+#                 # 根据两个页面大小比例调整坐标
			
 
				+#                 obj.bbox = (obj.bbox[0]*w_ratio, obj.bbox[1]*h_ratio,
			
 
				+#                             obj.bbox[2]*w_ratio, obj.bbox[3]*h_ratio)
			
 
				+#
			
 
				+#                 # 剔除水印字
			
 
				+#                 text = re.sub('[\n ]', '', obj.get_text())
			
 
				+#                 if len(text) == 1 and abs(obj.bbox[0] - obj.bbox[2]) >= 70:
			
 
				+#                     continue
			
 
				+#
			
 
				+#                 pdf_box_list.append([[int(obj.bbox[0]), int(obj.bbox[1])],
			
 
				+#                                      [],
			
 
				+#                                      [int(obj.bbox[2]), int(obj.bbox[3])],
			
 
				+#                                      []
			
 
				+#                                      ])
			
 
				+#                 pdf_text_list.append(re.sub('[\n]', '', obj.get_text()))
			
 
				+#
			
 
				+#             pdf_text_box_list = self.get_text_box_obj(pdf_text_list, pdf_box_list)
			
 
				+#
			
 
				+#             text_list2 = pdf_text_list
			
 
				+#             box_list2 = pdf_box_list
			
 
				+#             text_box_list2 = pdf_text_box_list
			
 
				+#
			
 
				+#         _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, table_list2, text_list2, box_list2, text_box_list2, from_pdf=from_pdf)
			
 
				+#
			
 
				+#         # 保存无边框表格文件
			
 
				+#         if _table_list:
			
 
				+#             try:
			
 
				+#                 self.save_b_table(_image_np, text_box_list2, from_pdf)
			
 
				+#             except:
			
 
				+#                 pass
			
 
				+#
			
 
				+#         # print('_text_box_list', _text_box_list)
			
 
				+#         # print('_table_list', _table_list)
			
 
				+#         if from_pdf:
			
 
				+#             text_box_list2 = []
			
 
				+#             table_list2 = []
			
 
				+#
			
 
				+#         if _table_list and _text_box_list:
			
 
				+#             text_box_list2 += _text_box_list
			
 
				+#             text_box_list2 = list(set(text_box_list2))
			
 
				+#             # table_list2 += _table_list
			
 
				+#             # obj_in_table_list2 = obj_in_table_list2.union(_obj_in_table_list)
			
 
				+#         return text_box_list2, _table_list, _obj_in_table_list
			
 
				+#
			
 
				+#     def table_process(self, list_line, list_text_boxes, _image_np):
			
 
				+#         # 调用现成方法形成表格
			
 
				+#         try:
			
 
				+#             if list_line:
			
 
				+#
			
 
				+#                 # 排除掉短且经过文字bbox中间的竖线
			
 
				+#                 temp_list = []
			
 
				+#                 for line in list_line:
			
 
				+#                     find_cnt = 0
			
 
				+#                     if abs(line[0]-line[2]) < abs(line[1]-line[3]) and abs(line[1] - line[3]) <= _image_np.shape[0] / 20:
			
 
				+#                         for t_obj in list_text_boxes:
			
 
				+#                             # if not (t_obj.bbox[1] <= line[1] <= t_obj.bbox[3] or t_obj.bbox[1] <= line[3] <= t_obj.bbox[3]):
			
 
				+#                             #     continue
			
 
				+#                             if line_iou([[t_obj.bbox[1], 0], [t_obj.bbox[3], 0]], [[line[1], 0], [line[3], 0]]) < 0.3:
			
 
				+#                                 continue
			
 
				+#                             if abs(t_obj.bbox[0]-t_obj.bbox[2])/5 + min(t_obj.bbox[0], t_obj.bbox[2]) <= line[0] <= abs(t_obj.bbox[0]-t_obj.bbox[2])/5*4 + min(t_obj.bbox[0], t_obj.bbox[2]) and (t_obj.bbox[0]-t_obj.bbox[2]) <= 60:
			
 
				+#                                 # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2], t_obj.get_text())
			
 
				+#                                 find_cnt += 1
			
 
				+#                                 if find_cnt >= 2:
			
 
				+#                                     break
			
 
				+#                     if find_cnt >= 2:
			
 
				+#                         continue
			
 
				+#                     temp_list.append(line)
			
 
				+#                 list_line = temp_list
			
 
				+#
			
 
				+#                 from format_convert.convert_tree import TableLine
			
 
				+#                 list_lines = []
			
 
				+#                 for line in list_line:
			
 
				+#                     list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
			
 
				+#
			
 
				+#                 lt = LineTable()
			
 
				+#                 tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
			
 
				+#                                                                                    sourceP_LB=False, splited=False,
			
 
				+#                                                                                    from_pdf=self.is_from_pdf,
			
 
				+#                                                                                    is_reverse=self.is_reverse)
			
 
				+#                 # 需分割textbox
			
 
				+#                 if connect_textbox_list:
			
 
				+#                     list_text_boxes = self.table_textbox_split(_image_np, connect_textbox_list, list_text_boxes)
			
 
				+#                     # 新的textbox，重新做表格
			
 
				+#                     tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
			
 
				+#                                                                                        sourceP_LB=False, splited=True,
			
 
				+#                                                                                        from_pdf=self.is_from_pdf,
			
 
				+#                                                                                        is_reverse=self.is_reverse)
			
 
				+#
			
 
				+#                 if not tables:
			
 
				+#                     return list_text_boxes, tables, obj_in_table
			
 
				+#                 return list_text_boxes, tables, obj_in_table
			
 
				+#             else:
			
 
				+#                 return list_text_boxes, [], set()
			
 
				+#         except:
			
 
				+#             traceback.print_exc()
			
 
				+#             return [-8], [-8], [-8]
			
 
				+#
			
 
				+#     def slice_process(self, _image_np):
			
 
				+#         slice_flag = need_image_slice(_image_np)
			
 
				+#         log("need_image_slice " + str(slice_flag) + " " + str(_image_np.shape))
			
 
				+#         _image_np_list = [_image_np]
			
 
				+#         if slice_flag:
			
 
				+#             # 长图分割
			
 
				+#             _image_np_list = image_slice_new(_image_np)
			
 
				+#             angle_dict = {}
			
 
				+#             for im in _image_np_list:
			
 
				+#                 _, angle = self.idc_process(im, return_angle=True)
			
 
				+#                 if angle in [0, 360]:
			
 
				+#                     angle = 0
			
 
				+#                 if angle in angle_dict.keys():
			
 
				+#                     angle_dict[angle] += 1
			
 
				+#                 else:
			
 
				+#                     angle_dict[angle] = 1
			
 
				+#
			
 
				+#             # idc不太准，有0度就直接使用
			
 
				+#             if 0 in angle_dict.keys():
			
 
				+#                 log('image_slice 0 in angle_dict')
			
 
				+#                 angle = 0
			
 
				+#             else:
			
 
				+#                 angle_list = [[key, value] for key, value in angle_dict.items()]
			
 
				+#                 angle_list.sort(key=lambda x: x[1])
			
 
				+#                 log('image_slice angle_list ' + str(angle_list))
			
 
				+#                 angle = angle_list[-1][0]
			
 
				+#             for i in range(len(_image_np_list)):
			
 
				+#                 _image_np_list[i] = image_rotate(_image_np_list[i], angle)
			
 
				+#             if angle in [180]:
			
 
				+#                 _image_np_list.reverse()
			
 
				+#
			
 
				+#         if len(_image_np_list) < 1:
			
 
				+#             log("image_slice failed!")
			
 
				+#             _image_np_list = [_image_np]
			
 
				+#         return _image_np_list
			
 
				+#
			
 
				+#     def get_text_box_obj(self, _text_list, _bbox_list):
			
 
				+#         from format_convert.convert_tree import TextBox
			
 
				+#         _text_box_list = []
			
 
				+#         for i in range(len(_bbox_list)):
			
 
				+#             bbox = _bbox_list[i]
			
 
				+#             b_text = _text_list[i]
			
 
				+#             _text_box_list.append(TextBox([bbox[0][0], bbox[0][1],
			
 
				+#                                            bbox[2][0], bbox[2][1]], b_text))
			
 
				+#         return _text_box_list
			
 
				+#
			
 
				+#     def save_b_table(self, image_np2, text_box_list2, from_pdf=False):
			
 
				+#         _start_time = time.time()
			
 
				+#         _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table'
			
 
				+#         # _path = 'D:/Project/format_conversion_maxcompute/save_b_table'
			
 
				+#         max_index = 20000
			
 
				+#         if os.path.exists(_path):
			
 
				+#             file_list = glob(_path + '/*')
			
 
				+#             if file_list:
			
 
				+#                 file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
			
 
				+#                 file_index_list.sort(key=lambda x: x)
			
 
				+#                 index = file_index_list[-1] + 1
			
 
				+#             else:
			
 
				+#                 index = 0
			
 
				+#             if index > max_index:
			
 
				+#                 return
			
 
				+#
			
 
				+#             # 文件md5
			
 
				+#             from format_convert import _global
			
 
				+#             _md5 = _global.get("md5")
			
 
				+#
			
 
				+#             _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
			
 
				+#             cv2.imwrite(_image_path, image_np2)
			
 
				+#             log('save b_table image success!')
			
 
				+#
			
 
				+#             # if from_pdf:
			
 
				+#             #     _file_path = _path + '/' + str(_md5) + '-' + str(index) + '.txt'
			
 
				+#             #     new_text_box_list2 = [str(x) + '\n' for x in text_box_list2]
			
 
				+#             #     with open(_file_path, 'w') as f:
			
 
				+#             #         f.writelines(new_text_box_list2)
			
 
				+#             #     log('save b_table txt success!')
			
 
				+#
			
 
				+#         log('save_b_table cost: ' + str(time.time()-_start_time))
			
 
				+#
			
 
				+#     def table_textbox_split(self, image_np2, connect_textbox_list, textbox_list):
			
 
				+#         """
			
 
				+#         两个单元格里的文本被ocr识别为一个，需分开才能准确放进表格
			
 
				+#
			
 
				+#         :return:
			
 
				+#         """
			
 
				+#         split_bbox_list = []
			
 
				+#         split_text_list = []
			
 
				+#         splited_textbox_list = []
			
 
				+#         for textbox in connect_textbox_list:
			
 
				+#             bbox = textbox.bbox
			
 
				+#             bbox = [[bbox[0], bbox[1]], [], [bbox[2], bbox[3]], []]
			
 
				+#             sub_image_np = image_np2[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
			
 
				+#             split_index_list = []
			
 
				+#             # 从左到右遍历img
			
 
				+#             for i in range(5, sub_image_np.shape[1]-5):
			
 
				+#                 # 找表格分割线，这一列都为黑色像素
			
 
				+#                 if np.where(sub_image_np[:, i, 0] < 200)[0].size >= sub_image_np.shape[0]:
			
 
				+#                     split_index_list.append(i)
			
 
				+#
			
 
				+#             # 判断两线之间宽度，去重
			
 
				+#             if len(split_index_list) > 1:
			
 
				+#                 last_index = split_index_list[0]
			
 
				+#                 temp_list = []
			
 
				+#                 delete_list = []
			
 
				+#                 for index in split_index_list[1:]:
			
 
				+#                     if index in delete_list:
			
 
				+#                         continue
			
 
				+#                     if index - last_index <= 5:
			
 
				+#                         delete_list.append(index)
			
 
				+#                     else:
			
 
				+#                         last_index = index
			
 
				+#                     temp_list.append(last_index)
			
 
				+#                 split_index_list = temp_list
			
 
				+#
			
 
				+#             # n条以上分割线，有问题
			
 
				+#             if len(split_index_list) == 0 or len(split_index_list) >= 2:
			
 
				+#                 # print('len(split_index_list)', len(split_index_list), split_index_list)
			
 
				+#                 continue
			
 
				+#             else:
			
 
				+#                 # 根据index拆开图片，重新ocr
			
 
				+#                 split_index_list.insert(0, 0)
			
 
				+#                 print('split_index_list1', split_index_list)
			
 
				+#                 for _i, index in enumerate(split_index_list):
			
 
				+#                     if _i == len(split_index_list) - 1:
			
 
				+#                         split_image_np = sub_image_np[:, index:, :]
			
 
				+#                         split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[2][0], bbox[2][1]], []])
			
 
				+#                     else:
			
 
				+#                         next_index = split_index_list[_i+1]
			
 
				+#                         split_image_np = sub_image_np[:, index:next_index, :]
			
 
				+#                         split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[0][0]+next_index, bbox[2][1]], []])
			
 
				+#
			
 
				+#                     # ocr
			
 
				+#                     split_image_bytes = np2bytes(split_image_np)
			
 
				+#                     text_list2, bbox_list2 = from_ocr_interface(split_image_bytes, is_table=1, only_rec=1)
			
 
				+#                     # print('text_list2', text_list2)
			
 
				+#                     # print('bbox_list2', split_bbox_list)
			
 
				+#                     if judge_error_code(text_list2):
			
 
				+#                         text2 = ''
			
 
				+#                     else:
			
 
				+#                         if text_list2:
			
 
				+#                             text2 = text_list2[0]
			
 
				+#                         else:
			
 
				+#                             text2 = ''
			
 
				+#                     split_text_list.append(text2)
			
 
				+#                 splited_textbox_list.append(textbox)
			
 
				+#
			
 
				+#         if split_text_list and split_bbox_list:
			
 
				+#             split_textbox_list = self.get_text_box_obj(split_text_list, split_bbox_list)
			
 
				+#             for tb in splited_textbox_list:
			
 
				+#                 if tb in textbox_list:
			
 
				+#                     textbox_list.remove(tb)
			
 
				+#             textbox_list += split_textbox_list
			
 
				+#
			
 
				+#         return textbox_list
			
 
				+#
			
 
				+#     def __call__(self):
			
 
				+#         from format_convert.convert_tree import _Table, _Sentence
			
 
				+#         log("into image_preprocess")
			
 
				+#         try:
			
 
				+#             if self.image_np is None:
			
 
				+#                 log("image_preprocess image_np is None")
			
 
				+#                 return []
			
 
				+#             if self.image_np.shape[0] <= 20 or self.image_np.shape[1] <= 20:
			
 
				+#                 log('image_np.shape[0] <= 20 or image_np.shape[1] <= 20')
			
 
				+#                 return []
			
 
				+#
			
 
				+#             if not self.b_table_from_text:
			
 
				+#                 # 判断是否需要长图分割
			
 
				+#                 idc_flag = False
			
 
				+#                 image_np_list = self.slice_process(self.image_np)
			
 
				+#                 if len(image_np_list) > 1:
			
 
				+#                     idc_flag = True
			
 
				+#
			
 
				+#                 reverse_flag = 0
			
 
				+#                 table_textbox_list = []
			
 
				+#                 for image_np in image_np_list:
			
 
				+#                     # 整体分辨率限制
			
 
				+#                     image_np = self.resize_process(image_np)
			
 
				+#
			
 
				+#                     # 印章去除
			
 
				+#                     image_np = self.isr_process(image_np)
			
 
				+#                     if isinstance(image_np, list):
			
 
				+#                         return image_np
			
 
				+#
			
 
				+#                     # 文字识别
			
 
				+#                     text_list, box_list = self.ocr_process(image_np)
			
 
				+#                     if judge_error_code(text_list):
			
 
				+#                         return text_list
			
 
				+#
			
 
				+#                     # 判断ocr识别是否正确
			
 
				+#                     # print('ocr_cant_read(text_list, box_list)', ocr_cant_read(text_list, box_list), idc_flag, text_list)
			
 
				+#                     if ocr_cant_read(text_list, box_list) and not idc_flag:
			
 
				+#                         # 方向分类
			
 
				+#                         image_np, angle = self.idc_process(image_np, return_angle=True)
			
 
				+#                         if isinstance(image_np, list):
			
 
				+#                             return image_np
			
 
				+#                         # 如果角度不变，旋转180
			
 
				+#                         if angle in [0, 360]:
			
 
				+#                             pass
			
 
				+#                             # log('ocr_cant_read image_rotate 180')
			
 
				+#                             # image_np = image_rotate(image_np, angle=180)
			
 
				+#                             # reverse_flag = 1
			
 
				+#                             # image_pil = Image.fromarray(image_np)
			
 
				+#                             # image_np = np.array(image_pil.rotate(180, expand=1))
			
 
				+#                         # cv2.imshow("idc_process", image_np)
			
 
				+#                         # cv2.waitKey(0)
			
 
				+#
			
 
				+#                         # 文字识别
			
 
				+#                         text_list1, box_list_1 = self.ocr_process(image_np)
			
 
				+#                         if judge_error_code(text_list1):
			
 
				+#                             return text_list1
			
 
				+#
			
 
				+#                         if len(text_list1) > 0 and ocr_cant_read(text_list1, box_list_1) and self.is_from_pdf:
			
 
				+#                             return [-16]
			
 
				+#
			
 
				+#                         # 比较字数
			
 
				+#                         # print("ocr process", len("".join(text_list)), len("".join(text_list1)))
			
 
				+#                         if len("".join(text_list)) < len("".join(text_list1)):
			
 
				+#                             text_list = text_list1
			
 
				+#                             box_list = box_list_1
			
 
				+#
			
 
				+#                     # 表格识别
			
 
				+#                     line_list = self.otr_process(image_np)
			
 
				+#                     if judge_error_code(line_list):
			
 
				+#                         return line_list
			
 
				+#
			
 
				+#                     # 生成TextBox对象
			
 
				+#                     text_box_list = self.get_text_box_obj(text_list, box_list)
			
 
				+#                     # for t in text_box_list:
			
 
				+#                     #     print('text_box0', t.get_text())
			
 
				+#
			
 
				+#                     # 表格生成
			
 
				+#                     text_box_list, table_list, obj_in_table_list = self.table_process(line_list, text_box_list, image_np)
			
 
				+#                     # for t in text_box_list:
			
 
				+#                     #     print('text_box1', t.get_text())
			
 
				+#                     # print('table_list', table_list)
			
 
				+#                     # for t in obj_in_table_list:
			
 
				+#                     #     print('obj_text_box2', t.get_text())
			
 
				+#                     if judge_error_code(table_list):
			
 
				+#                         return table_list
			
 
				+#
			
 
				+#                     # 无边框表格识别
			
 
				+#                     start_time = time.time()
			
 
				+#                     text_box_list, b_table_list, b_obj_in_table_list \
			
 
				+#                         = self.botr_process(image_np, table_list, text_list, box_list,
			
 
				+#                                             text_box_list, obj_in_table_list, self.b_table_from_text,
			
 
				+#                                             self.pdf_obj_list, self.pdf_layout_size,
			
 
				+#                                             )
			
 
				+#                     log('botr process cost: ' + str(time.time()-start_time))
			
 
				+#
			
 
				+#                     # 合并非表格的同一行TextBox
			
 
				+#                     text_box_list = self.merge_textbox(text_box_list, obj_in_table_list)
			
 
				+#
			
 
				+#                     table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
			
 
				+#
			
 
				+#                 if reverse_flag:
			
 
				+#                     table_textbox_list.reverse()
			
 
				+#
			
 
				+#                     for i in range(len(image_np_list)):
			
 
				+#                         image_np_list[i] = image_rotate(image_np_list[i], angle=180)
			
 
				+#                     image_np_list.reverse()
			
 
				+#
			
 
				+#                 # index = 0
			
 
				+#                 # for image_np in image_np_list:
			
 
				+#                 #     cv2.imshow(str(index) + '.jpg', image_np)
			
 
				+#                 #     cv2.waitKey(0)
			
 
				+#                 #     index += 1
			
 
				+#
			
 
				+#                 # 对象生成
			
 
				+#                 all_obj_list = []
			
 
				+#                 _add_y = 0
			
 
				+#                 for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
			
 
				+#                     obj_list = []
			
 
				+#                     for table in table_list:
			
 
				+#                         _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
			
 
				+#                         _table = _Table(table["table"], _table_bbox)
			
 
				+#                         obj_list.append(_table)
			
 
				+#                     for table in b_table_list:
			
 
				+#                         _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
			
 
				+#                         _table = _Table(table["table"], _table_bbox)
			
 
				+#                         obj_list.append(_table)
			
 
				+#                     for text_box in text_box_list:
			
 
				+#                         if text_box not in obj_in_table_list:
			
 
				+#                             text_box.bbox[1] += _add_y
			
 
				+#                             obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
			
 
				+#
			
 
				+#                     # 多图修正y
			
 
				+#                     if len(image_np_list) > 1:
			
 
				+#                         list_y = []
			
 
				+#                         for obj in obj_list:
			
 
				+#                             obj.y += _add_y
			
 
				+#                             list_y.append(obj.y)
			
 
				+#                         if len(list_y) > 0:
			
 
				+#                             _add_y += max(list_y)
			
 
				+#
			
 
				+#                     # 合并
			
 
				+#                     all_obj_list += obj_list
			
 
				+#
			
 
				+#             # 无边框表格图片
			
 
				+#             else:
			
 
				+#                 all_obj_list = []
			
 
				+#                 table_list = []
			
 
				+#                 text_list = []
			
 
				+#                 box_list = []
			
 
				+#                 text_box_list = []
			
 
				+#                 obj_in_table_list = set()
			
 
				+#
			
 
				+#                 # 表格识别
			
 
				+#                 line_list = self.otr_process(self.image_np)
			
 
				+#                 if judge_error_code(line_list):
			
 
				+#                     return line_list
			
 
				+#
			
 
				+#                 # 生成TextBox对象
			
 
				+#                 text_box_list = self.get_text_box_obj(text_list, box_list)
			
 
				+#
			
 
				+#                 # 表格生成
			
 
				+#                 text_box_list, table_list, obj_in_table_list = self.table_process(line_list, text_box_list, self.image_np)
			
 
				+#                 if judge_error_code(table_list):
			
 
				+#                     return table_list
			
 
				+#
			
 
				+#                 # 无边框表格识别
			
 
				+#                 start_time = time.time()
			
 
				+#                 text_box_list, table_list, obj_in_table_list \
			
 
				+#                     = self.botr_process(self.image_np, table_list,
			
 
				+#                                         text_list, box_list,
			
 
				+#                                                                             text_box_list,
			
 
				+#                                                                             obj_in_table_list,
			
 
				+#                                         self.b_table_from_text,
			
 
				+#                                         self.pdf_obj_list,
			
 
				+#                                         self.pdf_layout_size,
			
 
				+#                                                                             )
			
 
				+#                 log('botr process cost: ' + str(time.time()-start_time))
			
 
				+#
			
 
				+#                 # 合并非表格的同一行TextBox
			
 
				+#                 text_box_list = self.merge_textbox(text_box_list, obj_in_table_list)
			
 
				+#
			
 
				+#                 # 对象生成
			
 
				+#                 obj_list = []
			
 
				+#                 # print('table_list', table_list)
			
 
				+#                 for table in table_list:
			
 
				+#                     _table = _Table(table["table"], table["bbox"])
			
 
				+#                     obj_list.append(_table)
			
 
				+#                 for text_box in text_box_list:
			
 
				+#                     if text_box not in obj_in_table_list:
			
 
				+#                         obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
			
 
				+#
			
 
				+#                 # 合并
			
 
				+#                 all_obj_list += obj_list
			
 
				+#
			
 
				+#             return all_obj_list
			
 
				+#
			
 
				+#         except Exception as e:
			
 
				+#             log("image_preprocess error")
			
 
				+#             traceback.print_exc()
			
 
				+#             return [-1]
			
 
				+
			
 
				+
			
 
				 @memory_decorator
			
 
				 def picture2text(path, html=False):
			
 
				     log("into picture2text")
			
@@ -786,6 +1554,21 @@ def get_best_predict_size2(image_np, threshold=3000):
 
				     return h, w
			
 
				 
			
 
				 
			
 
				+def get_best_predict_size_by_area(image_np, threshold=1280):
			
 
				+    max_area = threshold*threshold
			
 
				+    height, width = image_np.shape[:2]
			
 
				+    area = height * width
			
 
				+
			
 
				+    if area <= max_area:
			
 
				+        return height, width
			
 
				+
			
 
				+    # 计算缩放比例
			
 
				+    scale = (max_area / area) ** 0.5
			
 
				+    new_width = int(width * scale)
			
 
				+    new_height = int(height * scale)
			
 
				+    return new_height, new_width
			
 
				+
			
 
				+
			
 
				 def image_slice(image_np):
			
 
				     """
			
 
				     slice the image if the height is to large
			
@@ -1269,6 +2052,17 @@ def image_process_old(image_np, image_path, is_from_pdf=False, is_from_docx=Fals
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    img111 = cv2.imread("C:/Users/Administrator/Downloads/1724146601927.png")
			
 
				-    cv2.imshow('111', img111)
			
 
				-    cv2.waitKey(0)
			
 
				+    # _pp = r'D:\Project\format_conversion_maxcompute\save_b_table' \
			
 
				+    #       r'\211-6591070e1cc8ea6904ba00a0a3d6c32f.png'
			
 
				+    _pp = r'C:\Users\Administrator\Desktop\test_b_table\error7.png'
			
 
				+    save_pp = r'D:\Project\format_conversion_maxcompute\format_convert\temp\test_convert_image.jpg'
			
 
				+    # img111 = cv2.imread(_pp)
			
 
				+    # img111 = pil_resize(img111, 1024, 768)
			
 
				+    # cv2.imwrite(save_pp, img111)
			
 
				+    # image_process(img111, '')
			
 
				+    # cv2.imshow('111', img111)
			
 
				+    # cv2.waitKey(0)
			
 
				+
			
 
				+    _html = ImageConvert(_pp, r"D:\Project\format_conversion_maxcompute\format_convert\temp").get_html()
			
 
				+    with open('../result.html', 'w', encoding='utf-8') as f:
			
 
				+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])
			
--- a/format_convert/convert_need_interface.py
+++ b/format_convert/convert_need_interface.py
@@ -144,6 +144,7 @@ def from_office_interface_240606(src_path, dest_path, target_format, retry_times
 
				 
			
 
				 
			
 
				 def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
			
 
				+    start_time = time.time()
			
 
				     try:
			
 
				         if from_remote:
			
 
				             # 重试
			
@@ -200,6 +201,8 @@ def from_office_interface(src_path, dest_path, target_format, retry_times=1, fro
 
				         log("from_office_interface error!")
			
 
				         traceback.print_exc()
			
 
				         return [-1]
			
 
				+    finally:
			
 
				+        log("from_office_interface cost time " + str(time.time()-start_time))
			
 
				 
			
 
				 
			
 
				 def from_tika_interface(src_path, from_remote=FROM_REMOTE):
			
@@ -239,17 +242,21 @@ def from_tika_interface(src_path, from_remote=FROM_REMOTE):
 
				             return [-2]
			
 
				 
			
 
				         _dict = r
			
 
				-        html = _dict.get("html")
			
 
				-        log("from_tika_interface cost time " + str(time.time()-start_time))
			
 
				-        return html
			
 
				+        data = _dict.get("data")
			
 
				+
			
 
				+        return data
			
 
				     except Exception as e:
			
 
				         log("from_tika_interface error!")
			
 
				         traceback.print_exc()
			
 
				         return [-11]
			
 
				+    finally:
			
 
				+        log("from_tika_interface cost time " + str(time.time()-start_time))
			
 
				 
			
 
				 
			
 
				 def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_REMOTE):
			
 
				     log("into from_ocr_interface")
			
 
				+    # print('FROM_REMOTE', FROM_REMOTE)
			
 
				+    start_time = time.time()
			
 
				     try:
			
 
				         base64_stream = base64.b64encode(image_stream)
			
 
				 
			
@@ -281,7 +288,10 @@ def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_RE
 
				                             log("retry post ocr_interface... left times " + str(retry_times_1))
			
 
				                             continue
			
 
				                     if judge_error_code(r):
			
 
				-                        return r
			
 
				+                        if is_table:
			
 
				+                            return r, r
			
 
				+                        else:
			
 
				+                            return r
			
 
				                     break
			
 
				             else:
			
 
				                 if globals().get("global_ocr_model") is None:
			
@@ -326,6 +336,8 @@ def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_RE
 
				             return [-1], [-1]
			
 
				         else:
			
 
				             return [-1]
			
 
				+    finally:
			
 
				+        log("from_ocr_interface cost time " + str(time.time()-start_time))
			
 
				 
			
 
				 
			
 
				 def from_gpu_interface_redis(_dict, model_type, predictor_type):
			
@@ -366,6 +378,7 @@ def from_gpu_interface_redis(_dict, model_type, predictor_type):
 
				 
			
 
				 def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE):
			
 
				     log("into from_otr_interface")
			
 
				+    start_time = time.time()
			
 
				     try:
			
 
				         base64_stream = base64.b64encode(image_stream)
			
 
				 
			
@@ -424,6 +437,8 @@ def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE)
 
				         log("from_otr_interface error!")
			
 
				         print("from_otr_interface", traceback.print_exc())
			
 
				         return [-1]
			
 
				+    finally:
			
 
				+        log("from_otr_interface cost time " + str(time.time()-start_time))
			
 
				 
			
 
				 
			
 
				 def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
			
@@ -487,7 +502,6 @@ def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
 
				             image_np = cv2.imdecode(buffer, 1)
			
 
				         else:
			
 
				             image_np = _dict.get("image")
			
 
				-        log("from_isr_interface cost time " + str(time.time()-start_time))
			
 
				         return image_np
			
 
				     except Exception as e:
			
 
				         log("from_isr_interface error!")
			
@@ -495,7 +509,7 @@ def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
 
				         return [-11]
			
 
				     finally:
			
 
				         # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
			
 
				-        pass
			
 
				+        log("from_isr_interface cost time " + str(time.time()-start_time))
			
 
				 
			
 
				 
			
 
				 def from_idc_interface(image_stream, from_remote=FROM_REMOTE):
			
@@ -543,12 +557,13 @@ def from_idc_interface(image_stream, from_remote=FROM_REMOTE):
 
				 
			
 
				         _dict = r
			
 
				         angle = _dict.get("angle")
			
 
				-        log("from_idc_interface cost time " + str(time.time()-start_time))
			
 
				         return angle
			
 
				     except Exception as e:
			
 
				         log("from_idc_interface error!")
			
 
				         traceback.print_exc()
			
 
				         return [-11]
			
 
				+    finally:
			
 
				+        log("from_idc_interface cost time " + str(time.time()-start_time))
			
 
				 
			
 
				 
			
 
				 def from_atc_interface(text, from_remote=FROM_REMOTE):
			
@@ -594,12 +609,13 @@ def from_atc_interface(text, from_remote=FROM_REMOTE):
 
				 
			
 
				         _dict = r
			
 
				         classification = _dict.get("classification")
			
 
				-        log("from_atc_interface cost time " + str(time.time()-start_time))
			
 
				         return classification
			
 
				     except Exception as e:
			
 
				         log("from_atc_interface error!")
			
 
				         traceback.print_exc()
			
 
				         return [-11]
			
 
				+    finally:
			
 
				+        log("from_atc_interface cost time " + str(time.time()-start_time))
			
 
				 
			
 
				 
			
 
				 def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
			
@@ -652,12 +668,13 @@ def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
 
				 
			
 
				         _dict = r
			
 
				         b_table_list = _dict.get("b_table_list")
			
 
				-        log("from_yolo_interface cost time " + str(time.time()-start_time))
			
 
				         return b_table_list
			
 
				     except Exception as e:
			
 
				         log("from_yolo_interface error!")
			
 
				         traceback.print_exc()
			
 
				         return [-11]
			
 
				+    finally:
			
 
				+        log("from_yolo_interface cost time " + str(time.time()-start_time))
			
 
				 
			
 
				 
			
 
				 def interface_pool_gunicorn(interface_type):
			
--- a/format_convert/convert_ofd.py
+++ b/format_convert/convert_ofd.py
@@ -0,0 +1,75 @@
 
				+import base64
			
 
				+import os
			
 
				+import re
			
 
				+import sys
			
 
				+import time
			
 
				+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
			
 
				+from format_convert.easyofd.easyofd.ofd import OFD
			
 
				+from format_convert.convert_tree import _Document, _Sentence, _Page
			
 
				+import logging
			
 
				+import traceback
			
 
				+from format_convert.convert_pdf import PDFConvert
			
 
				+from format_convert.utils import judge_error_code, get_logger, log
			
 
				+
			
 
				+
			
 
				+class OfdConvert:
			
 
				+    def __init__(self, path, unique_type_dir):
			
 
				+        self._doc = _Document(path)
			
 
				+        self.path = path
			
 
				+        self.unique_type_dir = unique_type_dir
			
 
				+        self.ofd = OFD()  # 初始化OFD 工具类
			
 
				+
			
 
				+    def convert(self):
			
 
				+        start_time = time.time()
			
 
				+        file_prefix = os.path.splitext(os.path.split(self.path)[1])[0]
			
 
				+
			
 
				+        with open(self.path, "rb") as f:
			
 
				+            ofd_b64 = str(base64.b64encode(f.read()), "utf-8")
			
 
				+
			
 
				+        self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_prefix}_xml",
			
 
				+                      save_dir=self.unique_type_dir)  # 读取ofdb64
			
 
				+        # print("ofd.data", ofd.data) # ofd.data 为程序解析结果
			
 
				+        pdf_bytes, page_need_to_image_dict = self.ofd.to_pdf(return_need_convert_as_image=True)  # 转pdf
			
 
				+        log('ofd to pdf cost: ' + str(time.time()-start_time))
			
 
				+        # print('page_need_to_image_dict', page_need_to_image_dict)
			
 
				+
			
 
				+        self.ofd.del_data()
			
 
				+
			
 
				+        file_name = re.split('[/\\\]', self.path)[-1]
			
 
				+        new_path = self.unique_type_dir + file_name[:-4] + '.pdf'
			
 
				+
			
 
				+        with open(new_path, "wb") as f:
			
 
				+            f.write(pdf_bytes)
			
 
				+        log('odf to pdf path ' + new_path + ' cost: ' + str(time.time()-start_time))
			
 
				+
			
 
				+        # 用pdf提取
			
 
				+        self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None,
			
 
				+                               page_need_to_image_dict=page_need_to_image_dict)
			
 
				+        # self._pdf.convert()
			
 
				+        # self._doc = self._pdf._doc
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        try:
			
 
				+            self.convert()
			
 
				+        except:
			
 
				+            traceback.print_exc()
			
 
				+            self._doc.error_code = [-1]
			
 
				+
			
 
				+        # 直接返回pdf处理的html
			
 
				+        if self._doc.error_code is not None:
			
 
				+            return self._doc.error_code
			
 
				+        else:
			
 
				+            return self._pdf.get_html()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    _p = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
			
 
				+    p = '../1750060386706.ofd'
			
 
				+    # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
			
 
				+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp\2" + '/'
			
 
				+    c = OfdConvert(_p, save_dir)
			
 
				+    _html = c.get_html()
			
 
				+    with open('../result.html', 'w', encoding='utf-8') as f:
			
 
				+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])
			
 
				+
			
 
				+
			
--- a/format_convert/convert_ofd_test.py
+++ b/format_convert/convert_ofd_test.py
@@ -0,0 +1,75 @@
 
				+import base64
			
 
				+import os
			
 
				+import re
			
 
				+import sys
			
 
				+import time
			
 
				+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
			
 
				+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
			
 
				+
			
 
				+from format_convert.utils import judge_error_code, get_logger, log, register_all_fonts
			
 
				+# register_all_fonts("/usr/share/fonts/")
			
 
				+
			
 
				+from format_convert.easyofd.easyofd.ofd import OFD
			
 
				+from format_convert.convert_tree import _Document, _Sentence, _Page
			
 
				+import logging
			
 
				+import traceback
			
 
				+from format_convert.convert_pdf import PDFConvert
			
 
				+
			
 
				+
			
 
				+class OfdConvert:
			
 
				+    def __init__(self, path, unique_type_dir):
			
 
				+        self._doc = _Document(path)
			
 
				+        self.path = path
			
 
				+        self.unique_type_dir = unique_type_dir
			
 
				+        self.ofd = OFD()  # 初始化OFD 工具类
			
 
				+
			
 
				+    def convert(self):
			
 
				+        start_time = time.time()
			
 
				+        file_prefix = os.path.splitext(os.path.split(self.path)[1])[0]
			
 
				+
			
 
				+        with open(self.path, "rb") as f:
			
 
				+            ofd_b64 = str(base64.b64encode(f.read()), "utf-8")
			
 
				+
			
 
				+        self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_prefix}_xml",
			
 
				+                      save_dir=self.unique_type_dir)  # 读取ofdb64
			
 
				+        # print("ofd.data", ofd.data) # ofd.data 为程序解析结果
			
 
				+        pdf_bytes = self.ofd.to_pdf()  # 转pdf
			
 
				+
			
 
				+        self.ofd.del_data()
			
 
				+
			
 
				+        file_name = re.split('[/\\\]', self.path)[-1]
			
 
				+        new_path = self.unique_type_dir + file_name[:-4] + '.pdf'
			
 
				+
			
 
				+        with open(new_path, "wb") as f:
			
 
				+            f.write(pdf_bytes)
			
 
				+        log('odf to pdf path ' + new_path + ' cost: ' + str(time.time()-start_time))
			
 
				+
			
 
				+        # 用pdf提取
			
 
				+        self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None)
			
 
				+        # _pdf.convert()
			
 
				+        # self._doc = _pdf._doc
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        try:
			
 
				+            self.convert()
			
 
				+        except:
			
 
				+            traceback.print_exc()
			
 
				+            self._doc.error_code = [-1]
			
 
				+
			
 
				+        # 直接返回doc处理的html
			
 
				+        if self._doc.error_code is not None:
			
 
				+            return self._doc.error_code
			
 
				+        else:
			
 
				+            return self._pdf.get_html()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    _p = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
			
 
				+    _p = '../1750381792388.ofd'
			
 
				+    # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
			
 
				+    save_dir = "/data/fangjiasheng/format_conversion_maxcompute/format_convert/temp" + '/'
			
 
				+    c = OfdConvert(_p, save_dir)
			
 
				+    _html = c.get_html()
			
 
				+    print(_html)
			
 
				+
			
 
				+
			
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -1,3 +1,6 @@
 
				+import shutil
			
 
				+import zlib
			
 
				+from glob import glob
			
 
				 import copy
			
 
				 import io
			
 
				 import os
			
@@ -23,10 +26,12 @@ from pdfminer.converter import PDFPageAggregator
 
				 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
			
 
				     LTTextBoxVertical, LTLine, LTTextContainer, LTTextLine
			
 
				 from format_convert.utils import judge_error_code, get_platform, LineTable, log, \
			
 
				-    memory_decorator, get_garble_code, get_md5_from_bytes, bytes2np, bbox_iou, get_garble_code2, get_traditional_chinese
			
 
				+    memory_decorator, get_garble_code, get_md5_from_bytes, bytes2np, bbox_iou, get_garble_code2, \
			
 
				+    get_traditional_chinese, ascii85_decode
			
 
				 import fitz
			
 
				 from format_convert.wrapt_timeout_decorator import timeout
			
 
				 from otr.table_line_pdf import table_line_pdf
			
 
				+from botr.extract_table import get_b_table_by_blank_colon
			
 
				 
			
 
				 
			
 
				 @memory_decorator
			
@@ -38,6 +43,7 @@ def pdf2text(path, unique_type_dir):
 
				 def pdf_analyze(interpreter, page, device, page_no):
			
 
				     pdf_time = time.time()
			
 
				     interpreter.process_page(page)
			
 
				+    # print('interpreter.process_page time', time.time()-pdf_time)
			
 
				     layout = device.get_result()
			
 
				     log("page_no: " + str(page_no) + " pdf_analyze cost: " + str(time.time() - pdf_time))
			
 
				     return layout
			
@@ -76,7 +82,7 @@ def read_pdfplumber(path, laparams):
 
				 
			
 
				 
			
 
				 class PDFConvert:
			
 
				-    def __init__(self, path, unique_type_dir, need_page_no):
			
 
				+    def __init__(self, path, unique_type_dir, need_page_no, page_need_to_image_dict=None):
			
 
				         self._doc = _Document(path)
			
 
				         self.path = path
			
 
				         self.unique_type_dir = unique_type_dir
			
@@ -89,7 +95,7 @@ class PDFConvert:
 
				         self.end_page_no = None
			
 
				         # 默认使用limit_page_cnt控制，前10页后10页
			
 
				         if self.need_page_no is None:
			
 
				-            self.limit_page_cnt = 20
			
 
				+            self.limit_page_cnt = 50
			
 
				         else:
			
 
				             # 使用start_page_no,end_page_no范围控制，例如2,5
			
 
				             ss = self.need_page_no.split(',')
			
@@ -120,6 +126,12 @@ class PDFConvert:
 
				         # 初始化_page
			
 
				         self._page = _Page(None, 0)
			
 
				 
			
 
				+        # 需要直接转成image来识别的页面
			
 
				+        if type(page_need_to_image_dict) is not dict:
			
 
				+            self.page_need_to_image_dict = {}
			
 
				+        else:
			
 
				+            self.page_need_to_image_dict = page_need_to_image_dict
			
 
				+
			
 
				     @memory_decorator
			
 
				     def init_package(self, package_name):
			
 
				         # 各个包初始化
			
@@ -128,7 +140,9 @@ class PDFConvert:
 
				                                 char_margin=0.3,
			
 
				                                 line_margin=0.01,
			
 
				                                 word_margin=0.01,
			
 
				-                                boxes_flow=0.1, )
			
 
				+                                # boxes_flow=0.1,
			
 
				+                                boxes_flow=None,
			
 
				+                                )
			
 
				             if package_name == self.packages[0]:
			
 
				                 self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
			
 
				                 self.has_init_pdf[0] = 1
			
@@ -153,7 +167,7 @@ class PDFConvert:
 
				             self._doc.error_code = [-3]
			
 
				 
			
 
				     @memory_decorator
			
 
				-    def convert(self, limit_page_cnt=20):
			
 
				+    def convert(self, limit_page_cnt=50):
			
 
				         if self.has_init_pdf[0] == 0:
			
 
				             self.init_package("pdfminer")
			
 
				         if self._doc.error_code is not None:
			
@@ -201,8 +215,11 @@ class PDFConvert:
 
				                     continue
			
 
				             # 限制pdf页数，只取前后各10页
			
 
				             else:
			
 
				-                if page_count > limit_page_cnt and int(limit_page_cnt / 2) <= page_no < page_count - int(
			
 
				-                        limit_page_cnt / 2):
			
 
				+                # if page_count > limit_page_cnt and int(limit_page_cnt / 2) <= page_no < page_count - int(
			
 
				+                #         limit_page_cnt / 2):
			
 
				+                #     page_no += 1
			
 
				+                #     continue
			
 
				+                if page_count > limit_page_cnt and page_no >= limit_page_cnt:
			
 
				                     page_no += 1
			
 
				                     continue
			
 
				 
			
@@ -222,6 +239,8 @@ class PDFConvert:
 
				         delete_water_mark_list = []
			
 
				 
			
 
				         for layout, layout_obj_list, max_y, page_no in layout_list:
			
 
				+            # for obj in layout_obj_list:
			
 
				+            #     print('obj', obj)
			
 
				             # 解析单页
			
 
				             start_time = time.time()
			
 
				             self._page = _Page(None, page_no)
			
@@ -251,7 +270,10 @@ class PDFConvert:
 
				                 find_flag = 0
			
 
				                 add_page_list = []
			
 
				                 for page in pages:
			
 
				-                    if not int(limit_page_cnt / 2) <= page_no < page_count - int(limit_page_cnt / 2):
			
 
				+                    # if not int(limit_page_cnt / 2) <= page_no < page_count - int(limit_page_cnt / 2):
			
 
				+                    #     page_no += 1
			
 
				+                    #     continue
			
 
				+                    if not (page_no >= limit_page_cnt):
			
 
				                         page_no += 1
			
 
				                         continue
			
 
				 
			
@@ -297,9 +319,11 @@ class PDFConvert:
 
				                     page_no += 1
			
 
				 
			
 
				                 if add_page_list:
			
 
				-                    self._doc.children = self._doc.children[
			
 
				-                                         :int(limit_page_cnt / 2)] + add_page_list + self._doc.children[
			
 
				-                                                                                     int(limit_page_cnt / 2):]
			
 
				+                    # self._doc.children = self._doc.children[:int(limit_page_cnt / 2)] \
			
 
				+                    #                      + add_page_list \
			
 
				+                    #                      + self._doc.children[int(limit_page_cnt / 2):]
			
 
				+                    self._doc.children = self._doc.children[:limit_page_cnt] \
			
 
				+                                         + add_page_list
			
 
				 
			
 
				         self.delete_same_image()
			
 
				         # self.delete_bold_text_duplicate()
			
@@ -375,10 +399,14 @@ class PDFConvert:
 
				 
			
 
				         return pages, delete_footer_header_list
			
 
				 
			
 
				+    @memory_decorator
			
 
				     def delete_bold_text_duplicate(self, lt_text_box_list):
			
 
				         # 拿出所有LTChar
			
 
				         lt_char_list = []
			
 
				         for lt_text_box in lt_text_box_list:
			
 
				+            if '.......' in lt_text_box.get_text():
			
 
				+                # print('....... lt_text_box continue')
			
 
				+                continue
			
 
				             for lt_text_line in lt_text_box:
			
 
				                 for lt_char in lt_text_line:
			
 
				                     if isinstance(lt_char, LTChar):
			
@@ -447,14 +475,16 @@ class PDFConvert:
 
				     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
			
 
				         list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
			
 
				                                                                                     from_pdf=True, is_reverse=False)
			
 
				-        self._page.in_table_objs = filter_objs
			
 
				+        # self._page.in_table_objs = filter_objs
			
 
				 
			
 
				         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
			
 
				 
			
 
				+        table_list = []
			
 
				         for table in list_tables:
			
 
				             _table = _Table(table["table"], table["bbox"])
			
 
				             # self._page.children.append(_table)
			
 
				             self._page.add_child(_table)
			
 
				+            table_list.append(_table)
			
 
				 
			
 
				         list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
			
 
				                                                         layout.bbox, page_no)
			
@@ -466,7 +496,7 @@ class PDFConvert:
 
				         # pdf对象需反向排序
			
 
				         # self._page.is_reverse = True
			
 
				 
			
 
				-        return list_tables
			
 
				+        return table_list
			
 
				 
			
 
				     def is_text_legal(self, lt_text_list, page_no):
			
 
				         # 无法识别pdf字符编码，整页用ocr
			
@@ -498,10 +528,11 @@ class PDFConvert:
 
				 
			
 
				         return True
			
 
				 
			
 
				+    @memory_decorator
			
 
				     def judge_b_table(self, lt_text_list, table_list, page_no):
			
 
				         table_h_list = []
			
 
				         for table in table_list:
			
 
				-            table_h_list.append([table.get('bbox')[1], table.get('bbox')[3]])
			
 
				+            table_h_list.append([table.bbox[1], table.bbox[3]])
			
 
				 
			
 
				         # 先分行
			
 
				         lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
			
@@ -528,6 +559,8 @@ class PDFConvert:
 
				         row_cnt = 0
			
 
				         b_table_row_list = []
			
 
				         all_b_table = []
			
 
				+        row_col_list = []
			
 
				+        all_row_col_list = []
			
 
				         for row in lt_text_row_list:
			
 
				             # 水印行跳过
			
 
				             if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
			
@@ -537,6 +570,7 @@ class PDFConvert:
 
				             for r in row:
			
 
				                 if re.search('[.·]{7,}', r.get_text()):
			
 
				                     continue_flag = True
			
 
				+                    all_row_col_list = []
			
 
				                     break
			
 
				             if continue_flag:
			
 
				                 continue
			
@@ -550,6 +584,7 @@ class PDFConvert:
 
				                     row_cnt += 1
			
 
				                     t_cnt = 0
			
 
				                     b_table_row_list += row
			
 
				+                    row_col_list += [row]
			
 
				                 else:
			
 
				                     # 容忍
			
 
				                     if t_cnt < tolerate_cnt:
			
@@ -557,15 +592,36 @@ class PDFConvert:
 
				                         continue
			
 
				                     if b_table_row_list and row_cnt >= is_b_table_cnt:
			
 
				                         all_b_table.append(b_table_row_list)
			
 
				+                        all_row_col_list.append(row_col_list)
			
 
				                     row_cnt = 0
			
 
				                     b_table_row_list = []
			
 
				+                    row_col_list = []
			
 
				             else:
			
 
				                 row_cnt += 1
			
 
				                 t_cnt = 0
			
 
				                 b_table_row_list += row
			
 
				+                row_col_list += [row]
			
 
				 
			
 
				         if b_table_row_list and row_cnt >= is_b_table_cnt:
			
 
				             all_b_table.append(b_table_row_list)
			
 
				+            all_row_col_list.append(row_col_list)
			
 
				+            # print('b_table_row_list', b_table_row_list)
			
 
				+
			
 
				+        # 排除大部分是两列的，因为前面已经新增了两列无边框的单独识别
			
 
				+        # print('len(all_row_col_list)', len(all_row_col_list))
			
 
				+        row_cnt = 0
			
 
				+        col_2_cnt = 0
			
 
				+        for row_col_list in all_row_col_list:
			
 
				+            for col_list in row_col_list:
			
 
				+                row_cnt += 1
			
 
				+                if len(col_list) == 2:
			
 
				+                    col_2_cnt += 1
			
 
				+                # print('col_list', col_list)
			
 
				+
			
 
				+        # print('row_cnt, col_2_cnt', row_cnt, col_2_cnt)
			
 
				+        if row_cnt == 0 or col_2_cnt / row_cnt >= 0.5:
			
 
				+            log("page_no: " + str(page_no) + ' is_b_table_flag False')
			
 
				+            return False
			
 
				 
			
 
				         # 对每个可能的b_table判断是否与table相交
			
 
				         is_b_table_flag = False
			
@@ -587,8 +643,35 @@ class PDFConvert:
 
				                 # print('table_h_list', table_h_list)
			
 
				                 break
			
 
				         log("page_no: " + str(page_no) + ' is_b_table_flag ' + str(is_b_table_flag))
			
 
				+        # 保存判断为True的pdf
			
 
				+        # if is_b_table_flag:
			
 
				+        #     self.save_b_table_pdf(page_no)
			
 
				         return is_b_table_flag
			
 
				 
			
 
				+    def save_b_table_pdf(self, page_no):
			
 
				+        # save_dir = r"D:\Project\format_conversion_maxcompute\save_b_table_pdf"
			
 
				+        save_dir = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_pdf'
			
 
				+        max_index = 200
			
 
				+        if os.path.exists(save_dir):
			
 
				+            file_list = glob(save_dir + '/*')
			
 
				+            if file_list:
			
 
				+                file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
			
 
				+                file_index_list.sort(key=lambda x: x)
			
 
				+                index = file_index_list[-1] + 1
			
 
				+            else:
			
 
				+                index = 0
			
 
				+            if index > max_index:
			
 
				+                return
			
 
				+        else:
			
 
				+            return
			
 
				+
			
 
				+        save_path = f'{save_dir}/{index}-{page_no}.pdf'
			
 
				+        try:
			
 
				+            shutil.copy(self.path, save_path)
			
 
				+            print("文件复制成功！")
			
 
				+        except Exception as e:
			
 
				+            print(f"文件复制失败：{e}")
			
 
				+
			
 
				     def char_to_text_box(self, char_list):
			
 
				         lt_text_box_list = []
			
 
				 
			
@@ -646,6 +729,7 @@ class PDFConvert:
 
				 
			
 
				         return lt_text_box_list, text_box_char_dict
			
 
				 
			
 
				+    @memory_decorator
			
 
				     def get_need_objs(self, obj_list, max_y):
			
 
				         # 文字
			
 
				         lt_char_list = []
			
@@ -695,6 +779,14 @@ class PDFConvert:
 
				             elif isinstance(x, (LTTextContainer, LTRect, LTLine, LTCurve)):
			
 
				                 lt_line_list.append(x)
			
 
				 
			
 
				+        # print('len(obj_list)', len(obj_list))
			
 
				+        # print('len(lt_char_list)', len(lt_char_list))
			
 
				+        # print('len(lt_text_box_list)', len(lt_text_box_list))
			
 
				+        # if len(lt_text_box_list) >= 200:
			
 
				+        #     for lt_text in lt_text_box_list:
			
 
				+        #         print('>= 200 lt_text', lt_text.get_text())
			
 
				+        # print('len(lt_image_list)', len(lt_image_list))
			
 
				+
			
 
				         if lt_figure_list:
			
 
				             temp_figure_list = []
			
 
				             for sub_figure in lt_figure_list:
			
@@ -719,8 +811,21 @@ class PDFConvert:
 
				 
			
 
				         text_box_char_dict = {**text_box_char_dict, **add_text_box_char_dict}
			
 
				 
			
 
				+        lt_text_box_list = self.delete_water_mark_by_location(lt_text_box_list)
			
 
				+
			
 
				+        # 分行后过滤
			
 
				+        temp_list = []
			
 
				+        for lt_text_box in lt_text_box_list:
			
 
				+            if lt_text_box.get_text() in ['', ' ', '\t', '\n', '\r']:
			
 
				+                continue
			
 
				+            temp_list.append(lt_text_box)
			
 
				+        if len(lt_text_box_list) != len(temp_list):
			
 
				+            log('filter lt_text_box_list ' + str(len(lt_text_box_list)) + ' -> ' + str(len(temp_list)))
			
 
				+        lt_text_box_list = temp_list
			
 
				+
			
 
				         return lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, lt_line_list, text_box_char_dict
			
 
				 
			
 
				+    @memory_decorator
			
 
				     def read_layout(self, page, page_no):
			
 
				         layout = self.get_layout(page, page_no)
			
 
				         if self._doc.error_code is not None:
			
@@ -834,6 +939,7 @@ class PDFConvert:
 
				 
			
 
				         return lt_text_box_list
			
 
				 
			
 
				+    @memory_decorator
			
 
				     def split_text_box_by_lines2(self, lt_line_list, lt_text_box_list, text_box_char_dict):
			
 
				         """
			
 
				         有单个字符位置信息，再根据表格线截断位置，分割text
			
@@ -932,12 +1038,23 @@ class PDFConvert:
 
				         return lt_text_box_list
			
 
				 
			
 
				     @memory_decorator
			
 
				-    # def convert_page(self, page, page_no, skip_image=0):
			
 
				     def convert_page(self, layout, layout_obj_list, max_y, page_no, delete_water_mark_list, skip_image=0):
			
 
				         # 若Page中一个obj都无，后面ocr整页识别 20240820
			
 
				         if max_y == 0 and len(layout_obj_list) > 0:
			
 
				             return
			
 
				 
			
 
				+        # 若该页在page_need_to_image_dict中为True，则直接ocr整页识别
			
 
				+        if self.page_need_to_image_dict.get(page_no) is True:
			
 
				+            page_image = self.get_page_image(page_no)
			
 
				+            if judge_error_code(page_image):
			
 
				+                self._page.error_code = page_image
			
 
				+            else:
			
 
				+                _image = _Image(page_image[1], page_image[0])
			
 
				+                _image.is_from_pdf = True
			
 
				+                _image.is_reverse = False
			
 
				+                self._page.add_child(_image)
			
 
				+            return
			
 
				+
			
 
				         lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, \
			
 
				             lt_line_list, text_box_char_dict = layout_obj_list
			
 
				 
			
@@ -999,45 +1116,56 @@ class PDFConvert:
 
				         # 正常读取该页对象
			
 
				         else:
			
 
				             # 图表对象
			
 
				-            for image in lt_image_list:
			
 
				-                try:
			
 
				-                    # print("pdf2text LTImage size", page_no, image.width, image.height)
			
 
				-                    image_stream = image.stream.get_data()
			
 
				-                    # 小的图忽略
			
 
				-                    if image.width <= 300 and image.height <= 300:
			
 
				-                        continue
			
 
				-                    # 查看提取的图片高宽，太大则用pdf输出图进行ocr识别
			
 
				-                    img_test = Image.open(io.BytesIO(image_stream))
			
 
				-                    if image.height >= 1000 and image.width >= 1000:
			
 
				-                        page_image = self.get_page_image(page_no)
			
 
				-                        if judge_error_code(page_image):
			
 
				-                            self._page.error_code = page_image
			
 
				-                        else:
			
 
				-                            _image = _Image(page_image[1], page_image[0])
			
 
				-                            _image.is_from_pdf = True
			
 
				-                            _image.is_reverse = False
			
 
				-                            self._page.add_child(_image)
			
 
				-                            image_md5 = get_md5_from_bytes(page_image[1])
			
 
				-                            self.md5_image_obj_list.append([image_md5, _image])
			
 
				-                        return
			
 
				-                    # 比较小的图则直接保存用ocr识别
			
 
				-                    else:
			
 
				-                        temp_path = self.unique_type_dir + 'page' + str(page_no) \
			
 
				-                                    + '_lt' + str(lt_image_list.index(image)) + '.jpg'
			
 
				-                        img_test.save(temp_path)
			
 
				-                        with open(temp_path, "rb") as ff:
			
 
				-                            image_stream = ff.read()
			
 
				-                        _image = _Image(image_stream, temp_path, image.bbox)
			
 
				-                        self._page.add_child(_image)
			
 
				-                        image_md5 = get_md5_from_bytes(image_stream)
			
 
				-                        self.md5_image_obj_list.append([image_md5, _image])
			
 
				-                except Exception:
			
 
				-                    log("page_no: " + str(page_no) + " pdfminer read image fail! use pymupdf read image...")
			
 
				-                    traceback.print_exc()
			
 
				+            # for image in lt_image_list:
			
 
				+            #     try:
			
 
				+            #         # print("pdf2text LTImage size", page_no, image.width, image.height)
			
 
				+            #         # image_stream = image.stream.get_data()
			
 
				+            #         print('image.stream.get_filters()', image.stream.get_filters())
			
 
				+            #         image_stream = image.stream.get_data()
			
 
				+            #         # 小的图忽略
			
 
				+            #         if image.width <= 300 and image.height <= 300:
			
 
				+            #             continue
			
 
				+            #         # 查看提取的图片高宽，太大则用pdf输出图进行ocr识别
			
 
				+            #         img_test = Image.open(io.BytesIO(image_stream))
			
 
				+            #         # img_test = self.pdfminer_stream_to_image(image)
			
 
				+            #         if image.height >= 1000 and image.width >= 1000:
			
 
				+            #             page_image = self.get_page_image(page_no)
			
 
				+            #             if judge_error_code(page_image):
			
 
				+            #                 self._page.error_code = page_image
			
 
				+            #             else:
			
 
				+            #                 _image = _Image(page_image[1], page_image[0])
			
 
				+            #                 _image.is_from_pdf = True
			
 
				+            #                 _image.is_reverse = False
			
 
				+            #                 self._page.add_child(_image)
			
 
				+            #                 image_md5 = get_md5_from_bytes(page_image[1])
			
 
				+            #                 self.md5_image_obj_list.append([image_md5, _image])
			
 
				+            #             return
			
 
				+            #         # 比较小的图则直接保存用ocr识别
			
 
				+            #         else:
			
 
				+            #             temp_path = self.unique_type_dir + 'page' + str(page_no) \
			
 
				+            #                         + '_lt' + str(lt_image_list.index(image)) + '.jpg'
			
 
				+            #             img_test.save(temp_path)
			
 
				+            #             with open(temp_path, "rb") as ff:
			
 
				+            #                 image_stream = ff.read()
			
 
				+            #             _image = _Image(image_stream, temp_path, image.bbox)
			
 
				+            #             self._page.add_child(_image)
			
 
				+            #             image_md5 = get_md5_from_bytes(image_stream)
			
 
				+            #             self.md5_image_obj_list.append([image_md5, _image])
			
 
				+            #     except Exception:
			
 
				+            #         log("page_no: " + str(page_no) + " pdfminer read image fail! use pymupdf read image...")
			
 
				+            #         traceback.print_exc()
			
 
				 
			
 
				             # pdf对象需反向排序
			
 
				             # self._page.is_reverse = True
			
 
				 
			
 
				+            status = self.pdfminer_read_page_images(lt_image_list, page_no)
			
 
				+            if not status:
			
 
				+                log('pymupdf 提取页面中图片 page_no: ' + str(page_no))
			
 
				+                status = self.pymupdf_read_page_images(page_no)
			
 
				+            if not status:
			
 
				+                log('pymupdf 整页转化为图片 page_no: ' + str(page_no))
			
 
				+                status = self.pymupdf_get_whole_page_image(page_no)
			
 
				+
			
 
				             if self.has_init_pdf[3] == 0:
			
 
				                 self.init_package("pdfplumber")
			
 
				 
			
@@ -1059,7 +1187,24 @@ class PDFConvert:
 
				             table_list = self.recognize_text(layout, page_no, lt_text_box_list, lt_line_list)
			
 
				 
			
 
				             # 根据text规律，判断该页是否可能有无边框表格
			
 
				+            try:
			
 
				+                b_table_list, _ = get_b_table_by_blank_colon(lt_text_box_list, table_list, layout.bbox, None)
			
 
				+            except:
			
 
				+                traceback.print_exc()
			
 
				+                b_table_list = []
			
 
				+                self._page.error_code = [-23]
			
 
				+
			
 
				+            if b_table_list:
			
 
				+                for table in b_table_list:
			
 
				+                    _table = _Table(table[0], table[1])
			
 
				+                    table_list += [_table]
			
 
				+                    self._page.add_child(_table)
			
 
				+
			
 
				+            for t in table_list:
			
 
				+                self._page.table_bbox_list.append(t.bbox)
			
 
				+
			
 
				             if self.judge_b_table(lt_text_box_list, table_list, page_no):
			
 
				+                # log('judge_b_table match! ' + str(page_no))
			
 
				                 page_image = self.get_page_image(page_no)
			
 
				                 if judge_error_code(page_image):
			
 
				                     self._page.error_code = page_image
			
@@ -1073,6 +1218,7 @@ class PDFConvert:
 
				                     _image.b_table_layout_size = (layout.width, layout.height)
			
 
				                     self._page.add_child(_image)
			
 
				 
			
 
				+    @memory_decorator
			
 
				     def get_layout(self, page, page_no):
			
 
				         if self.has_init_pdf[0] == 0:
			
 
				             self.init_package("pdfminer")
			
@@ -1096,6 +1242,7 @@ class PDFConvert:
 
				         log("page_no: " + str(page_no) + " get_layout cost: " + str(time.time() - start_time))
			
 
				         return layout
			
 
				 
			
 
				+    @memory_decorator
			
 
				     def get_page_image(self, page_no):
			
 
				         start_time = time.time()
			
 
				         try:
			
@@ -1503,6 +1650,7 @@ class PDFConvert:
 
				             return [-12]
			
 
				         return html
			
 
				 
			
 
				+    @memory_decorator
			
 
				     def delete_water_mark(self, lt_text_list, page_bbox, times=5):
			
 
				         # 删除过多重复字句，为水印
			
 
				         duplicate_dict = {}
			
@@ -1540,6 +1688,32 @@ class PDFConvert:
 
				                 temp_text_list.append(_obj)
			
 
				         return temp_text_list, delete_text
			
 
				 
			
 
				+    @memory_decorator
			
 
				+    def delete_water_mark_by_location(self, lt_text_box_list):
			
 
				+        x_text_box_dict = {}
			
 
				+        # 水印，x坐标相同，且长度为1
			
 
				+        for lt_text_box in lt_text_box_list:
			
 
				+            x1, y1, x2, y2 = lt_text_box.bbox
			
 
				+            text = lt_text_box.get_text()
			
 
				+            if len(text) != 1:
			
 
				+                continue
			
 
				+            key = f'{x1}-{x2}-{text}'
			
 
				+            if key in x_text_box_dict:
			
 
				+                x_text_box_dict[key] += [lt_text_box]
			
 
				+            else:
			
 
				+                x_text_box_dict[key] = [lt_text_box]
			
 
				+
			
 
				+        len1 = len(lt_text_box_list)
			
 
				+        for key, box_list in x_text_box_dict.items():
			
 
				+            if len(box_list) >= 3:
			
 
				+                for box in box_list:
			
 
				+                    if box in lt_text_box_list:
			
 
				+                        lt_text_box_list.remove(box)
			
 
				+        len2 = len(lt_text_box_list)
			
 
				+        if len1 != len2:
			
 
				+            log('delete_water_mark_by_location box num ' + str(len1) + ' -> ' + str(len2))
			
 
				+        return lt_text_box_list
			
 
				+
			
 
				     def delete_water_mark_by_color(self, lt_text_list):
			
 
				         # 删除浅色字体，大概率为水印
			
 
				         # 1. 单个char颜色透明度0.8以上
			
@@ -1587,6 +1761,9 @@ class PDFConvert:
 
				         water_mark_text_box_list = []
			
 
				         sin_range = [0.3, 0.94]
			
 
				         for lt_text_box in lt_text_list:
			
 
				+            if '.......' in lt_text_box.get_text():
			
 
				+                # print('....... lt_text_box continue')
			
 
				+                continue
			
 
				             for lt_text_line in lt_text_box:
			
 
				                 for lt_char in lt_text_line:
			
 
				                     matrix = lt_char.matrix
			
@@ -1634,6 +1811,126 @@ class PDFConvert:
 
				             log("page_no: " + str(page_no) + " get_single_pdf error!")
			
 
				             return [-3]
			
 
				 
			
 
				+    def pymupdf_read_page_images(self, page_no):
			
 
				+        try:
			
 
				+            self.init_package("PyMuPDF")
			
 
				+            # 获取指定页面
			
 
				+            page = self.doc_pymupdf.load_page(page_no)
			
 
				+            # 获取页面中所有图片的信息
			
 
				+            image_list = page.get_images(full=True)
			
 
				+
			
 
				+            # 存储提取的图片信息
			
 
				+            extracted_images = []
			
 
				+
			
 
				+            # 遍历图片列表
			
 
				+            for img_index, img_info in enumerate(image_list):
			
 
				+                xref = img_info[0]  # 图片xref编号
			
 
				+                base_image = self.doc_pymupdf.extract_image(xref)
			
 
				+                image_bytes = base_image["image"]  # 图片字节数据
			
 
				+                image_ext = base_image["ext"]  # 图片扩展名
			
 
				+
			
 
				+                # 获取图片在页面中的位置和大小
			
 
				+                bbox = img_info[0:4]  # x0, y0, x1, y1
			
 
				+                # print('img_info', img_info)
			
 
				+                width = img_info[2] - img_info[0]  # 计算宽度
			
 
				+                height = img_info[3] - img_info[1]  # 计算高度
			
 
				+
			
 
				+                # 构建图片信息字典
			
 
				+                img_data = {
			
 
				+                    "xref": xref,
			
 
				+                    "width": width,
			
 
				+                    "height": height,
			
 
				+                    "image": image_bytes,
			
 
				+                    "ext": image_ext,
			
 
				+                    "bbox": bbox
			
 
				+                }
			
 
				+                extracted_images.append(img_data)
			
 
				+
			
 
				+            image_obj_list = []
			
 
				+            for index, d in enumerate(extracted_images):
			
 
				+                temp_path = self.unique_type_dir + 'page' + str(page_no) \
			
 
				+                            + '_lt2_' + str(index) + '.jpg'
			
 
				+                image_bytes = d.get("image")
			
 
				+                bbox = d.get('bbox')
			
 
				+                with open(temp_path, 'wb') as f:
			
 
				+                    f.write(image_bytes)
			
 
				+
			
 
				+                _image = _Image(image_bytes, temp_path, bbox)
			
 
				+                image_md5 = get_md5_from_bytes(image_bytes)
			
 
				+                image_obj_list.append([_image, image_md5])
			
 
				+        except:
			
 
				+            traceback.print_exc()
			
 
				+            return False
			
 
				+
			
 
				+        for _image, image_md5 in image_obj_list:
			
 
				+            self._page.add_child(_image)
			
 
				+            self.md5_image_obj_list.append([image_md5, _image])
			
 
				+        return True
			
 
				+
			
 
				+    def pymupdf_get_whole_page_image(self, page_no):
			
 
				+        image_obj_list = []
			
 
				+        page_image = self.get_page_image(page_no)
			
 
				+        if judge_error_code(page_image):
			
 
				+            self._page.error_code = page_image
			
 
				+            return False
			
 
				+        else:
			
 
				+            _image = _Image(page_image[1], page_image[0])
			
 
				+            _image.is_from_pdf = True
			
 
				+            _image.is_reverse = False
			
 
				+            image_md5 = get_md5_from_bytes(page_image[1])
			
 
				+            image_obj_list.append([_image, image_md5])
			
 
				+
			
 
				+        for _image, image_md5 in image_obj_list:
			
 
				+            self._page.add_child(_image)
			
 
				+            self.md5_image_obj_list.append([image_md5, _image])
			
 
				+        return True
			
 
				+
			
 
				+    def pdfminer_read_page_images(self, lt_image_list, page_no):
			
 
				+        # 图表对象
			
 
				+        image_obj_list = []
			
 
				+        for image in lt_image_list:
			
 
				+            try:
			
 
				+                # print("pdf2text LTImage size", page_no, image.width, image.height)
			
 
				+                # image_stream = image.stream.get_data()
			
 
				+                # print('image.stream.get_filters()', image.stream.get_filters())
			
 
				+                image_stream = image.stream.get_data()
			
 
				+                # 小的图忽略
			
 
				+                if image.width <= 300 and image.height <= 300:
			
 
				+                    continue
			
 
				+                # 查看提取的图片高宽，太大则用pdf输出图进行ocr识别
			
 
				+                img_test = Image.open(io.BytesIO(image_stream))
			
 
				+                # img_test = self.pdfminer_stream_to_image(image)
			
 
				+                # if image.height >= 1000 and image.width >= 1000:
			
 
				+                #     page_image = self.get_page_image(page_no)
			
 
				+                #     if judge_error_code(page_image):
			
 
				+                #         self._page.error_code = page_image
			
 
				+                #     else:
			
 
				+                #         _image = _Image(page_image[1], page_image[0])
			
 
				+                #         _image.is_from_pdf = True
			
 
				+                #         _image.is_reverse = False
			
 
				+                #         image_md5 = get_md5_from_bytes(page_image[1])
			
 
				+                #         image_obj_list.append([_image, image_md5])
			
 
				+                # # 比较小的图则直接保存用ocr识别
			
 
				+                # else:
			
 
				+                temp_path = self.unique_type_dir + 'page' + str(page_no) \
			
 
				+                            + '_lt_' + str(lt_image_list.index(image)) + '.jpg'
			
 
				+                img_test.save(temp_path)
			
 
				+                with open(temp_path, "rb") as ff:
			
 
				+                    image_stream = ff.read()
			
 
				+                _image = _Image(image_stream, temp_path, image.bbox)
			
 
				+                self._page.add_child(_image)
			
 
				+                image_md5 = get_md5_from_bytes(image_stream)
			
 
				+                self.md5_image_obj_list.append([image_md5, _image])
			
 
				+            except Exception:
			
 
				+                log("page_no: " + str(page_no) + " pdfminer read image fail!")
			
 
				+                traceback.print_exc()
			
 
				+                return False
			
 
				+
			
 
				+        for _image, image_md5 in image_obj_list:
			
 
				+            self._page.add_child(_image)
			
 
				+            self.md5_image_obj_list.append([image_md5, _image])
			
 
				+        return True
			
 
				+
			
 
				 
			
 
				 def get_text_font():
			
 
				     def flags_decomposer(flags):
			
@@ -1999,4 +2296,8 @@ class ParseUtils:
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    PDFConvert(r"C:/Users/Administrator/Downloads/1651896704621.pdf", "C:/Users/Administrator/Downloads/1").get_html()
			
 
				+    _pp = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf/e-116.pdf'
			
 
				+    # _pp = r'C:\Users\Administrator\Downloads\1746582280828.pdf'
			
 
				+    _html = PDFConvert(_pp, r"D:\Project\format_conversion_maxcompute\format_convert\temp", None).get_html()
			
 
				+    with open('../result.html', 'w', encoding='utf-8') as f:
			
 
				+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])
			
--- a/format_convert/convert_test.py
+++ b/format_convert/convert_test.py
@@ -11,15 +11,6 @@ from glob import glob
 
				 import requests
			
 
				 
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				-from pdfminer.converter import PDFPageAggregator
			
 
				-from pdfminer.layout import LAParams, LTLine
			
 
				-from pdfminer.pdfdocument import PDFDocument
			
 
				-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
			
 
				-from pdfminer.pdfpage import PDFPage
			
 
				-from pdfminer.pdfparser import PDFParser
			
 
				-from pdfplumber import PDF
			
 
				-
			
 
				-from otr.table_line_pdf import _plot
			
 
				 
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
			
@@ -44,7 +35,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
 
				     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
			
 
				             'timeout': timeout, 'save_middle': save_middle}
			
 
				 
			
 
				-    # _url = 'http://121.46.18.113:15010/convert'
			
 
				+    # _url = 'http://dianxin.bidizhaobiao.com:15010/convert'
			
 
				     # _url = 'http://192.168.2.103:15010/convert'
			
 
				     # _url = 'http://192.168.2.102:15010/convert'
			
 
				     # _url = 'http://172.16.160.65:15010/convert'
			
@@ -53,7 +44,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
 
				     text_str = ""
			
 
				     try:
			
 
				         result = json.loads(request_post(_url, data, time_out=timeout+20))
			
 
				-
			
 
				+        print('result', result)
			
 
				         for t in result.get("result_html"):
			
 
				             text_str += t
			
 
				         to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
			
@@ -67,7 +58,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
 
				                 to_html(new_path, text_str)
			
 
				 
			
 
				         print(_md5)
			
 
				-        print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
			
 
				+        # print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
			
 
				         print("result_text", result.get("result_text")[0][:20])
			
 
				         print("is_success", result.get("is_success"))
			
 
				     except:
			
@@ -80,7 +71,6 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
 
				     return p, 1
			
 
				 
			
 
				 
			
 
				-
			
 
				 def test_path():
			
 
				     # _url = 'http://121.46.18.113:15010/convert'
			
 
				     _url = 'http://192.168.0.115:15010/convert'
			
@@ -186,21 +176,25 @@ def test_kimi():
 
				 
			
 
				 if __name__ == '__main__':
			
 
				     if get_platform() == "Windows":
			
 
				-        # file_path = "C:/Users/Administrator/Downloads/1672314827836.pdf"
			
 
				+        # file_path = "C:/Users/Administrator/Downloads/1750737587843.ofd"
			
 
				+        # file_path = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf/e-1.pdf'
			
 
				         # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
			
 
				 
			
 
				-        # file_path = "C:/Users/Administrator/Desktop/test_xls/error7.xls"
			
 
				-        # file_path = "C:/Users/Administrator/Desktop/test_doc/error15.doc"
			
 
				-        # file_path = "C:/Users/Administrator/Desktop/test_swf/error1.swf"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xlsx"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/test_doc/error17.docx"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/test_swf/error2.swf"
			
 
				         # file_path = "C:/Users/Administrator/Desktop/test_rar/error1.rar"
			
 
				-        file_path = "C:/Users/Administrator/Desktop/test_image/error7.png"
			
 
				-        # file_path = "C:/Users/Administrator/Desktop/test_b_table/error13.pdf"
			
 
				-        # file_path = "C:/Users/Administrator/Desktop/test_pdf/表格连接error/error6.pdf"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/test_image/error18.png"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/test_b_table/error29.png"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/test_pdf/普通error/error6.pdf"
			
 
				         # file_path = "C:/Users/Administrator/Desktop/test_table_head/error2.pdf"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
			
 
				+        file_path = "C:/Users/Administrator/Desktop/test_ofd/1750381792388.ofd"
			
 
				     else:
			
 
				         file_path = "1660296734009.pdf"
			
 
				 
			
 
				-    test_one(file_path, page_no_range='1,-1', timeout=1000, save_middle=None)
			
 
				+    # test_one(file_path, page_no_range="1,-1", timeout=1000, save_middle=None)
			
 
				+    test_one(file_path, page_no_range=None, timeout=1000, save_middle=None)
			
 
				 
			
 
				     # run_files()
			
 
				 
			
@@ -212,21 +206,21 @@ if __name__ == '__main__':
 
				     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
			
 
				     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
			
 
				     # file_path = r"C:\Users\Administrator\Desktop\test_b_table/"
			
 
				-    file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
			
 
				-    test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
			
 
				-                     ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
			
 
				-                     ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
			
 
				-                     ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
			
 
				-                     ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
			
 
				-                     ['error50.pdf', '1,-1'],
			
 
				-                     ['error59.pdf', '1,-1'],
			
 
				-                     ['error60.pdf', '1,-1'],
			
 
				-                     ['error61.pdf', '1,-1'],
			
 
				-                     ['error7.pdf', '39,57'],
			
 
				-                     ['error8.pdf', '7,12'],
			
 
				-                     ['error23.pdf', '1,-1']
			
 
				-                     ]
			
 
				-    index = 11
			
 
				+    # file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
			
 
				+    # test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
			
 
				+    #                  ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
			
 
				+    #                  ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
			
 
				+    #                  ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
			
 
				+    #                  ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
			
 
				+    #                  ['error50.pdf', '1,-1'],
			
 
				+    #                  ['error59.pdf', '1,-1'],
			
 
				+    #                  ['error60.pdf', '1,-1'],
			
 
				+    #                  ['error61.pdf', '1,-1'],
			
 
				+    #                  ['error7.pdf', '39,57'],
			
 
				+    #                  ['error8.pdf', '7,12'],
			
 
				+    #                  ['error23.pdf', '1,-1']
			
 
				+    #                  ]
			
 
				+    # index = 11
			
 
				     # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
			
 
				 
			
 
				 
			
--- a/format_convert/convert_tree.py
+++ b/format_convert/convert_tree.py
@@ -61,6 +61,8 @@ class _Page:
 
				         self.in_table_objs = set()
			
 
				         # 是否pdf
			
 
				         self.is_pdf = 0
			
 
				+        # 所有表格范围
			
 
				+        self.table_bbox_list = []
			
 
				 
			
 
				     def add_child(self, child):
			
 
				         if child.error_code is None:
			
@@ -74,12 +76,66 @@ class _Page:
 
				 
			
 
				         self.children = sort_object(self.children, self.is_reverse)
			
 
				 
			
 
				+        # 有图片类型，需返回图片中所有对象，并重新设置图片中的bbox，以及图片后的对象的bbox
			
 
				+        image_add_y = 0
			
 
				+        add_childern = []
			
 
				+        for child in self.children:
			
 
				+            if type(child) == _Image:
			
 
				+                image_children = child.get_html(return_children=True)
			
 
				+                if judge_error_code(image_children) and not self.is_pdf:
			
 
				+                    self.error_code = image_children
			
 
				+                    return self.error_code
			
 
				+                if len(image_children) == 0:
			
 
				+                    continue
			
 
				+                image_children = sort_object(image_children, False)
			
 
				+
			
 
				+                # 单张图可能无bbox，但文档中的图有bbox
			
 
				+                if child.bbox != (0, 0, 0, 0):
			
 
				+                    for i_child in image_children:
			
 
				+                        i_child.bbox = [i_child.bbox[0], i_child.bbox[1] + child.bbox[3] + image_add_y,
			
 
				+                                        i_child.bbox[2], i_child.bbox[3] + child.bbox[3] + image_add_y
			
 
				+                                        ]
			
 
				+
			
 
				+                image_add_y += image_children[-1].bbox[3]
			
 
				+                add_childern += image_children
			
 
				+                continue
			
 
				+
			
 
				+            # 图片对象后面的对象，bbox重新设置
			
 
				+            child.bbox = [child.bbox[0], child.bbox[1] + image_add_y,
			
 
				+                          child.bbox[2], child.bbox[3] + image_add_y
			
 
				+                          ]
			
 
				+            # self.children += child.get_html(return_children=True)
			
 
				+
			
 
				+        self.children += add_childern
			
 
				+        self.children = sort_object(self.children, self.is_reverse)
			
 
				+
			
 
				+        # 获取所有table，计算bbox，排除在table中的sentence
			
 
				+        for child in self.children:
			
 
				+            if type(child) == _Table:
			
 
				+                # table_bbox = get_table_bbox(child.content)
			
 
				+                # print('table.content ', child.content)
			
 
				+                # print('child.bbox', child.bbox)
			
 
				+                self.table_bbox_list += [child.bbox]
			
 
				+
			
 
				         html_text = ""
			
 
				         image_html = ""
			
 
				         text_html = ""
			
 
				         for child in self.children:
			
 
				+            if type(child) == _Image:
			
 
				+                continue
			
 
				+            if type(child) == _Sentence:
			
 
				+                continue_flag = 0
			
 
				+                for table_bbox in self.table_bbox_list:
			
 
				+                    # print('table_bbox', table_bbox)
			
 
				+                    if table_bbox[1] - 3 <= child.bbox[1] <= child.bbox[3] <= table_bbox[3] + 3:
			
 
				+                        continue_flag = 1
			
 
				+                        break
			
 
				+                if continue_flag:
			
 
				+                    continue
			
 
				+
			
 
				             # 先调用get_html才能更新error_code
			
 
				             child_html_text = child.get_html()
			
 
				+            # print('sort child_html_text', child_html_text)
			
 
				             if child.error_code is not None:
			
 
				                 self.error_code = child.error_code
			
 
				                 return ""
			
@@ -158,14 +214,16 @@ class _Image:
 
				         else:
			
 
				             self.error_code = child.error_code
			
 
				 
			
 
				-    def get_html(self):
			
 
				+    def get_html(self, return_children=False):
			
 
				         # 将Image转为Sentence,table
			
 
				         self.convert()
			
 
				         # if self.error_code == [-16]:
			
 
				         #     self.error_code = None
			
 
				         #     return "<div>#idc error#<div>"
			
 
				         if self.error_code is not None:
			
 
				-            return ""
			
 
				+            return self.error_code
			
 
				+        if return_children:
			
 
				+            return self.children
			
 
				 
			
 
				         html_text = ""
			
 
				         self.children = sort_object(self.children)
			
@@ -192,7 +250,9 @@ class _Image:
 
				                                  self.b_table_layout_size, self.is_reverse)
			
 
				         if judge_error_code(obj_list):
			
 
				             # 20241101 注释 图片识别报错返回空
			
 
				-            # self.error_code = obj_list
			
 
				+            # 20250604 不是来源pdf的，返回错误码
			
 
				+            if not self.is_from_pdf:
			
 
				+                self.error_code = obj_list
			
 
				             return
			
 
				 
			
 
				         if self.b_table_from_text:
			
@@ -213,9 +273,19 @@ class _Table:
 
				         self.bbox = bbox
			
 
				         self.x = bbox[0]
			
 
				         self.y = bbox[1]
			
 
				-        self.shape = (len(content), len(content[0]))
			
 
				+        if len(content) and len(content[0]):
			
 
				+            self.shape = (len(content), len(content[0]))
			
 
				+        else:
			
 
				+            self.shape = (0, 0)
			
 
				         self.error_code = None
			
 
				 
			
 
    def get_table_bbox(self, table):
        """Return the bounding box that encloses every cell of ``table``.

        :param table: iterable of rows; each row is an iterable of cell
            objects exposing a ``bbox`` sequence ``(x1, y1, x2, y2)``.
        :return: ``[x1, y1, x2, y2]`` where ``x1``/``y1`` are the minima of the
            cells' first two bbox values and ``x2``/``y2`` the maxima of the
            last two.
        :raises ValueError: if ``table`` contains no cells.
        """
        # Flatten once so the table is walked a single time instead of once
        # per coordinate.
        cell_boxes = [cell.bbox for row in table for cell in row]
        if not cell_boxes:
            raise ValueError('cannot compute the bbox of a table without cells')
        x1_values, y1_values, x2_values, y2_values = zip(*(box[:4] for box in cell_boxes))
        return [min(x1_values), min(y1_values), max(x2_values), max(y2_values)]
			
 
				+
			
 
				     def get_html(self):
			
 
				         if self.error_code is not None:
			
 
				             return ""
			
@@ -227,6 +297,9 @@ class _Table:
 
				             html_text = get_table_html(self.content)
			
 
				             return html_text
			
 
				 
			
 
				+    def __repr__(self):
			
 
				+        return '(%s@#@%s)' % (str('table'), '@'.join([str(x) for x in self.bbox]))
			
 
				+
			
 
				 
			
 
				 class _Sentence:
			
 
				     def __init__(self, content, bbox, is_html=False):
			
@@ -249,6 +322,9 @@ class _Sentence:
 
				         else:
			
 
				             return add_div(self.content)
			
 
				 
			
 
				+    def __repr__(self):
			
 
				+        return '(%s@#@%s)' % (str(self.content), '@'.join([str(x) for x in self.bbox]))
			
 
				+
			
 
				 
			
 
				 class TextBox:
			
 
				     def __init__(self, bbox, text):
			
@@ -261,6 +337,17 @@ class TextBox:
 
				     def __str__(self):
			
 
				         return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
			
 
				 
			
 
				+    def __repr__(self):
			
 
				+        return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
			
 
				+
			
 
				+    def __hash__(self):
			
 
				+        return hash(self.__str__())
			
 
				+
			
 
				+    def __eq__(self, other):
			
 
				+        if isinstance(other, TextBox):
			
 
				+            return self.__str__() == other.__str__()
			
 
				+        return False
			
 
				+
			
 
				 
			
 
				 class TableLine:
			
 
				     def __init__(self, bbox):
			
--- a/format_convert/convert_wps.py
+++ b/format_convert/convert_wps.py
@@ -0,0 +1,61 @@
 
				+import os
			
 
				+import re
			
 
				+import sys
			
 
				+
			
 
				+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
			
 
				+from format_convert.convert_tree import _Document, _Sentence, _Page
			
 
				+import logging
			
 
				+import traceback
			
 
				+from format_convert.convert_doc import DocConvert
			
 
				+from format_convert.utils import judge_error_code, get_logger, log
			
 
				+
			
 
				+
			
 
				+class WpsConvert:
			
 
				+    def __init__(self, path, unique_type_dir):
			
 
				+        self._doc = _Document(path)
			
 
				+        self.path = path
			
 
				+        self.unique_type_dir = unique_type_dir
			
 
				+
			
 
				+    def convert(self):
			
 
				+        # 改后缀，调用doc处理
			
 
				+        print('self.path', self.path)
			
 
				+        file_name = re.split('[/\\\]', self.path)[-1]
			
 
				+        with open(self.path, 'rb') as file:
			
 
				+            content = file.read()
			
 
				+
			
 
				+        new_file_name = file_name[:-4] + '.doc'
			
 
				+        new_file_path = self.unique_type_dir + new_file_name
			
 
				+        print('new_file_path', new_file_path)
			
 
				+        with open(new_file_path, 'wb') as file:
			
 
				+            file.write(content)
			
 
				+
			
 
				+        log('wps file ' + file_name + ' -> ' + new_file_name)
			
 
				+
			
 
				+        self._doc_convert = DocConvert(new_file_path, self.unique_type_dir)
			
 
				+        self._doc_convert.convert()
			
 
				+        self._doc = self._doc_convert._doc
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        try:
			
 
				+            self.convert()
			
 
				+        except:
			
 
				+            traceback.print_exc()
			
 
				+            self._doc.error_code = [-1]
			
 
				+
			
 
				+        # 直接返回doc处理的html
			
 
				+        if self._doc.error_code is not None:
			
 
				+            return self._doc.error_code
			
 
				+        else:
			
 
				+            return self._doc.get_html()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    _p = "C:/Users/Administrator/Downloads/1723004790329.wps"
			
 
				+    # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
			
 
				+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
			
 
				+    c = WpsConvert(_p, save_dir)
			
 
				+    _html = c.get_html()
			
 
				+    with open('../result.html', 'w', encoding='utf-8') as f:
			
 
				+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])
			
 
				+
			
 
				+
			
--- a/format_convert/easyofd/easyofd/__init__.py
+++ b/format_convert/easyofd/easyofd/__init__.py
@@ -0,0 +1,6 @@
 
				+from .ofd import OFD
			
 
				+__version__ = "0.5.1"
			
 
				+__author__ = "renoyuan"
			
 
				+__email__ = "renoyuan@foxmail.com"
			
 
				+__description__ = "一个用于OFD文档处理的Python库"
			
 
				+__all__ = ["OFD"]
			
--- a/format_convert/easyofd/easyofd/chinese_characters.txt
+++ b/format_convert/easyofd/easyofd/chinese_characters.txt
@@ -0,0 +1,474 @@
 
				+豈
			
 
				+更
			
 
				+車
			
 
				+賈
			
 
				+滑
			
 
				+串
			
 
				+句
			
 
				+龜
			
 
				+龜
			
 
				+契
			
 
				+金
			
 
				+喇
			
 
				+奈
			
 
				+懶
			
 
				+癩
			
 
				+羅
			
 
				+蘿
			
 
				+螺
			
 
				+裸
			
 
				+邏
			
 
				+樂
			
 
				+洛
			
 
				+烙
			
 
				+珞
			
 
				+落
			
 
				+酪
			
 
				+駱
			
 
				+亂
			
 
				+卵
			
 
				+欄
			
 
				+爛
			
 
				+蘭
			
 
				+鸞
			
 
				+嵐
			
 
				+濫
			
 
				+藍
			
 
				+襤
			
 
				+拉
			
 
				+臘
			
 
				+蠟
			
 
				+廊
			
 
				+朗
			
 
				+浪
			
 
				+狼
			
 
				+郎
			
 
				+來
			
 
				+冷
			
 
				+勞
			
 
				+擄
			
 
				+櫓
			
 
				+爐
			
 
				+盧
			
 
				+老
			
 
				+蘆
			
 
				+虜
			
 
				+路
			
 
				+露
			
 
				+魯
			
 
				+鷺
			
 
				+碌
			
 
				+祿
			
 
				+綠
			
 
				+菉
			
 
				+錄
			
 
				+鹿
			
 
				+論
			
 
				+壟
			
 
				+弄
			
 
				+籠
			
 
				+聾
			
 
				+牢
			
 
				+磊
			
 
				+賂
			
 
				+雷
			
 
				+壘
			
 
				+屢
			
 
				+樓
			
 
				+淚
			
 
				+漏
			
 
				+累
			
 
				+縷
			
 
				+陋
			
 
				+勒
			
 
				+肋
			
 
				+凜
			
 
				+凌
			
 
				+稜
			
 
				+綾
			
 
				+菱
			
 
				+陵
			
 
				+讀
			
 
				+拏
			
 
				+樂
			
 
				+諾
			
 
				+丹
			
 
				+寧
			
 
				+怒
			
 
				+率
			
 
				+異
			
 
				+北
			
 
				+磻
			
 
				+便
			
 
				+復
			
 
				+不
			
 
				+泌
			
 
				+數
			
 
				+索
			
 
				+參
			
 
				+塞
			
 
				+省
			
 
				+葉
			
 
				+說
			
 
				+殺
			
 
				+辰
			
 
				+沈
			
 
				+拾
			
 
				+若
			
 
				+掠
			
 
				+略
			
 
				+亮
			
 
				+兩
			
 
				+凉
			
 
				+梁
			
 
				+糧
			
 
				+良
			
 
				+諒
			
 
				+量
			
 
				+勵
			
 
				+呂
			
 
				+女
			
 
				+廬
			
 
				+旅
			
 
				+濾
			
 
				+礪
			
 
				+閭
			
 
				+驪
			
 
				+麗
			
 
				+黎
			
 
				+力
			
 
				+曆
			
 
				+歷
			
 
				+轢
			
 
				+年
			
 
				+憐
			
 
				+戀
			
 
				+撚
			
 
				+漣
			
 
				+煉
			
 
				+璉
			
 
				+秊
			
 
				+練
			
 
				+聯
			
 
				+輦
			
 
				+蓮
			
 
				+連
			
 
				+鍊
			
 
				+列
			
 
				+劣
			
 
				+咽
			
 
				+烈
			
 
				+裂
			
 
				+說
			
 
				+廉
			
 
				+念
			
 
				+捻
			
 
				+殮
			
 
				+簾
			
 
				+獵
			
 
				+令
			
 
				+囹
			
 
				+寧
			
 
				+嶺
			
 
				+怜
			
 
				+玲
			
 
				+瑩
			
 
				+羚
			
 
				+聆
			
 
				+鈴
			
 
				+零
			
 
				+靈
			
 
				+領
			
 
				+例
			
 
				+禮
			
 
				+醴
			
 
				+隸
			
 
				+惡
			
 
				+了
			
 
				+僚
			
 
				+寮
			
 
				+尿
			
 
				+料
			
 
				+樂
			
 
				+燎
			
 
				+療
			
 
				+蓼
			
 
				+遼
			
 
				+龍
			
 
				+暈
			
 
				+阮
			
 
				+劉
			
 
				+杻
			
 
				+柳
			
 
				+流
			
 
				+溜
			
 
				+琉
			
 
				+留
			
 
				+硫
			
 
				+紐
			
 
				+類
			
 
				+六
			
 
				+戮
			
 
				+陸
			
 
				+倫
			
 
				+崙
			
 
				+淪
			
 
				+輪
			
 
				+律
			
 
				+慄
			
 
				+栗
			
 
				+率
			
 
				+隆
			
 
				+利
			
 
				+吏
			
 
				+履
			
 
				+易
			
 
				+李
			
 
				+梨
			
 
				+泥
			
 
				+理
			
 
				+痢
			
 
				+罹
			
 
				+裏
			
 
				+裡
			
 
				+里
			
 
				+離
			
 
				+匿
			
 
				+溺
			
 
				+吝
			
 
				+燐
			
 
				+璘
			
 
				+藺
			
 
				+隣
			
 
				+鱗
			
 
				+麟
			
 
				+林
			
 
				+淋
			
 
				+臨
			
 
				+立
			
 
				+笠
			
 
				+粒
			
 
				+狀
			
 
				+炙
			
 
				+識
			
 
				+什
			
 
				+茶
			
 
				+刺
			
 
				+切
			
 
				+度
			
 
				+拓
			
 
				+糖
			
 
				+宅
			
 
				+洞
			
 
				+暴
			
 
				+輻
			
 
				+行
			
 
				+降
			
 
				+見
			
 
				+廓
			
 
				+兀
			
 
				+嗀
			
 
				+﨎
			
 
				+﨏
			
 
				+塚
			
 
				+﨑
			
 
				+晴
			
 
				+﨓
			
 
				+﨔
			
 
				+凞
			
 
				+猪
			
 
				+益
			
 
				+礼
			
 
				+神
			
 
				+祥
			
 
				+福
			
 
				+靖
			
 
				+精
			
 
				+羽
			
 
				+﨟
			
 
				+蘒
			
 
				+﨡
			
 
				+諸
			
 
				+﨣
			
 
				+﨤
			
 
				+逸
			
 
				+都
			
 
				+﨧
			
 
				+﨨
			
 
				+﨩
			
 
				+飯
			
 
				+飼
			
 
				+館
			
 
				+鶴
			
 
				+郞
			
 
				+隷
			
 
				+侮
			
 
				+僧
			
 
				+免
			
 
				+勉
			
 
				+勤
			
 
				+卑
			
 
				+喝
			
 
				+嘆
			
 
				+器
			
 
				+塀
			
 
				+墨
			
 
				+層
			
 
				+屮
			
 
				+悔
			
 
				+慨
			
 
				+憎
			
 
				+懲
			
 
				+敏
			
 
				+既
			
 
				+暑
			
 
				+梅
			
 
				+海
			
 
				+渚
			
 
				+漢
			
 
				+煮
			
 
				+爫
			
 
				+琢
			
 
				+碑
			
 
				+社
			
 
				+祉
			
 
				+祈
			
 
				+祐
			
 
				+祖
			
 
				+祝
			
 
				+禍
			
 
				+禎
			
 
				+穀
			
 
				+突
			
 
				+節
			
 
				+練
			
 
				+縉
			
 
				+繁
			
 
				+署
			
 
				+者
			
 
				+臭
			
 
				+艹
			
 
				+艹
			
 
				+著
			
 
				+褐
			
 
				+視
			
 
				+謁
			
 
				+謹
			
 
				+賓
			
 
				+贈
			
 
				+辶
			
 
				+逸
			
 
				+難
			
 
				+響
			
 
				+頻
			
 
				+恵
			
 
				+𤋮
			
 
				+舘
			
 
				+﩮
			
 
				+﩯
			
 
				+並
			
 
				+况
			
 
				+全
			
 
				+侀
			
 
				+充
			
 
				+冀
			
 
				+勇
			
 
				+勺
			
 
				+喝
			
 
				+啕
			
 
				+喙
			
 
				+嗢
			
 
				+塚
			
 
				+墳
			
 
				+奄
			
 
				+奔
			
 
				+婢
			
 
				+嬨
			
 
				+廒
			
 
				+廙
			
 
				+彩
			
 
				+徭
			
 
				+惘
			
 
				+慎
			
 
				+愈
			
 
				+憎
			
 
				+慠
			
 
				+懲
			
 
				+戴
			
 
				+揄
			
 
				+搜
			
 
				+摒
			
 
				+敖
			
 
				+晴
			
 
				+朗
			
 
				+望
			
 
				+杖
			
 
				+歹
			
 
				+殺
			
 
				+流
			
 
				+滛
			
 
				+滋
			
 
				+漢
			
 
				+瀞
			
 
				+煮
			
 
				+瞧
			
 
				+爵
			
 
				+犯
			
 
				+猪
			
 
				+瑱
			
 
				+甆
			
 
				+画
			
 
				+瘝
			
 
				+瘟
			
 
				+益
			
 
				+盛
			
 
				+直
			
 
				+睊
			
 
				+着
			
 
				+磌
			
 
				+窱
			
 
				+節
			
 
				+类
			
 
				+絛
			
 
				+練
			
 
				+缾
			
 
				+者
			
 
				+荒
			
 
				+華
			
 
				+蝹
			
 
				+襁
			
 
				+覆
			
 
				+視
			
 
				+調
			
 
				+諸
			
 
				+請
			
 
				+謁
			
 
				+諾
			
 
				+諭
			
 
				+謹
			
 
				+變
			
 
				+贈
			
 
				+輸
			
 
				+遲
			
 
				+醙
			
 
				+鉶
			
 
				+陼
			
 
				+難
			
 
				+靖
			
 
				+韛
			
 
				+響
			
 
				+頋
			
 
				+頻
			
 
				+鬒
			
 
				+龜
			
 
				+𢡊
			
 
				+𢡄
			
 
				+𣏕
			
 
				+㮝
			
 
				+䀘
			
 
				+䀹
			
 
				+𥉉
			
 
				+𥳐
			
 
				+𧻓
			
 
				+齃
			
 
				+龎
			
--- a/format_convert/easyofd/easyofd/draw/__init__.py
+++ b/format_convert/easyofd/easyofd/draw/__init__.py
@@ -0,0 +1,23 @@
 
				+import os
			
 
				+import sys
			
 
				+
			
 
				+from reportlab.pdfbase import pdfmetrics
			
 
				+
			
 
				+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../../")
			
 
				+from format_convert.easyofd.easyofd.parser_ofd import *
			
 
				+
			
 
# Font names, covering Chinese and Latin family names as well as
# prefixed variants such as "SWPMEH+SimSun".
# NOTE(review): presumably the fonts the drawing helpers recognise -- confirm
# against FontTool / DrawPDF.
FONTS = ['宋体',"SWPMEH+SimSun",'SimSun','KaiTi','楷体',"STKAITI","SWLCQE+KaiTi",
         'Courier New','STSong-Light',"CourierNew","SWANVV+CourierNewPSMT",
         "CourierNewPSMT","BWSimKai","hei","黑体","SimHei","SWDKON+SimSun",
         "SWCRMF+CourierNewPSMT","SWHGME+KaiTi"]
			
 
				+
			
 
				+from .font_tools import FontTool
			
 
				+from .draw_pdf import DrawPDF
			
 
				+from .draw_ofd import OFDWrite
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+    
			
--- a/format_convert/easyofd/easyofd/draw/draw_ofd.py
+++ b/format_convert/easyofd/easyofd/draw/draw_ofd.py
@@ -0,0 +1,290 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME: F:\code\easyofd\easyofd\draw
			
 
				+# CREATE_TIME: 2023-10-26
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# note:  写入 xml 目录并打包成ofd 文件
			
 
				+from datetime import datetime
			
 
				+from io import BytesIO
			
 
				+from typing import Optional
			
 
				+
			
 
				+from PIL import Image
			
 
				+from loguru import logger
			
 
				+
			
 
				+from .ofdtemplate import CurId, OFDTemplate, DocumentTemplate, DocumentResTemplate, PublicResTemplate, ContentTemplate, \
			
 
				+    OFDStructure
			
 
				+from .pdf_parse import DPFParser
			
 
				+
			
 
				+
			
 
				+class OFDWrite(object):
			
 
				+    """
			
 
				+    写入ofd 工具类
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, ):
			
 
				+        self.OP = 200 / 25.4
			
 
				+        # self.OP = 1
			
 
				+
			
 
				+    def build_ofd_entrance(self, id_obj: Optional[CurId] = None):
			
 
				+        """
			
 
				+        build_ofd_entrance
			
 
				+        """
			
 
				+        CreationDate = str(datetime.now())
			
 
				+        ofd_entrance = OFDTemplate(CreationDate=CreationDate, id_obj=id_obj)
			
 
				+        return ofd_entrance
			
 
				+
			
 
				+    def build_document(self, img_len, id_obj: Optional[CurId] = None, PhysicalBox: Optional[str] = "0 0 140 90"):
			
 
				+        """
			
 
				+        build_document
			
 
				+        """
			
 
				+        pages = []
			
 
				+
			
 
				+        for idx in range(img_len):
			
 
				+            pages.append(
			
 
				+                {
			
 
				+                    "@ID": f"{idx + 1}",
			
 
				+                    "@BaseLoc": f"Pages/Page_{idx}/Content.xml"
			
 
				+                }
			
 
				+            )
			
 
				+        document = DocumentTemplate(Page=pages, id_obj=id_obj, PhysicalBox=PhysicalBox)
			
 
				+        return document
			
 
				+
			
 
				+    def build_document_res(self, img_len: int = 0, id_obj: Optional[CurId] = None,
			
 
				+                           pfd_res_uuid_map: Optional[dict] = None):
			
 
				+        """
			
 
				+        build_document_res
			
 
				+        """
			
 
				+        MultiMedia = []
			
 
				+        DrawParams = []  # todo DrawParams 参数后面有空增加
			
 
				+        pfd_img = None
			
 
				+        if pfd_res_uuid_map:
			
 
				+            pfd_img = pfd_res_uuid_map.get("img")
			
 
				+
			
 
				+        if img_len and not pfd_res_uuid_map:
			
 
				+            for num in range(img_len):
			
 
				+                MultiMedia.append({
			
 
				+                    "@ID": 0,
			
 
				+                    "@Type": "Image",
			
 
				+                    "ofd:MediaFile": f"Image_{num}.jpg",
			
 
				+                    "res_uuid": f"{num}",
			
 
				+                })
			
 
				+        elif pfd_res_uuid_map and pfd_img:
			
 
				+            for res_uuid in pfd_img.keys():
			
 
				+                name = f"Image_{res_uuid}.jpg"
			
 
				+                MultiMedia.append({
			
 
				+                    "@ID": 0,
			
 
				+                    "@Type": "Image",
			
 
				+                    "ofd:MediaFile": name,
			
 
				+                    "res_uuid": res_uuid,
			
 
				+
			
 
				+                })
			
 
				+
			
 
				+        document_res = DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
			
 
				+        return document_res
			
 
				+
			
 
				+    def build_public_res(self, id_obj: CurId = None, pfd_res_uuid_map: dict = None):
			
 
				+        """
			
 
				+        build_public_res
			
 
				+        """
			
 
				+        fonts = []
			
 
				+
			
 
				+        pfd_font = None
			
 
				+        if pfd_res_uuid_map:
			
 
				+            pfd_font = pfd_res_uuid_map.get("font")
			
 
				+
			
 
				+        if pfd_res_uuid_map and pfd_font:
			
 
				+            for res_uuid, font in pfd_font.items():
			
 
				+                fonts.append({
			
 
				+                    "@ID": 0,
			
 
				+                    "@FontName": font,
			
 
				+                    "@FamilyName": font,  # 匹配替代字型
			
 
				+                    "res_uuid": res_uuid,
			
 
				+                    "@FixedWidth": "false",
			
 
				+                    "@Serif": "false",
			
 
				+                    "@Bold": "false",
			
 
				+                    "@Charset": "prc"
			
 
				+                })
			
 
				+        else:
			
 
				+            pass
			
 
				+
			
 
				+        public_res = PublicResTemplate(Font=fonts, id_obj=id_obj)
			
 
				+        return public_res
			
 
				+
			
 
				+    def build_content_res(self, pil_img_list=None, pdf_info_list=None, id_obj: CurId = None,
			
 
				+                          pfd_res_uuid_map: dict = None):
			
 
				+        """
			
 
				+        pil_img_list - >一张图片是一页
			
 
				+        content_res -> 写入 pdf 信息
			
 
				+        """
			
 
				+        PhysicalBox = None
			
 
				+        content_res_list = []
			
 
				+        if pil_img_list:
			
 
				+            for idx, pil_img in enumerate(pil_img_list):
			
 
				+                # print(pil_img)
			
 
				+                # print(idx, pil_img[1], pil_img[2])
			
 
				+                PhysicalBox = f"0 0 {pil_img[1]} {pil_img[2]}"
			
 
				+                ImageObject = [{
			
 
				+                    "@ID": 0,
			
 
				+                    "@CTM": f"{pil_img[1]} 0 0 {pil_img[2]} 0 0",
			
 
				+                    "@Boundary": f"0 0 {pil_img[1]} {pil_img[2]}",
			
 
				+                    "res_uuid": f"{idx}",  # 资源标识
			
 
				+                    "@ResourceID": f""
			
 
				+                }]
			
 
				+
			
 
				+                conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,
			
 
				+
			
 
				+                                         CGTransform=[], PathObject=[], TextObject=[], id_obj=id_obj)
			
 
				+                # print(conten)
			
 
				+                content_res_list.append(conten)
			
 
				+        elif pdf_info_list:  # 写入读取后的pdf 结果 # todo 图片id 需要关联得提前定义或者有其他方式反向对齐
			
 
				+
			
 
				+            for idx, content in enumerate(pdf_info_list):
			
 
				+                ImageObject = []
			
 
				+                TextObject = []
			
 
				+                PhysicalBox = pfd_res_uuid_map["other"]["page_size"][idx]
			
 
				+                PhysicalBox = f"0 0 {PhysicalBox[0]} {PhysicalBox[1]}"  # page_size 没有的话使用document 里面的
			
 
				+                for block in content:
			
 
				+                    # print(block)
			
 
				+
			
 
				+                    bbox = block['bbox']
			
 
				+                    x0, y0, length, height = bbox[0] / self.OP, bbox[1] / self.OP, (bbox[2] - bbox[0]) / self.OP, (
			
 
				+                            bbox[3] - bbox[1]) / self.OP
			
 
				+                    if block["type"] == "text":
			
 
				+
			
 
				+                        count = len(block.get("text"))
			
 
				+
			
 
				+                        TextObject.append({
			
 
				+                            "@ID": 0,
			
 
				+                            "res_uuid": block.get("res_uuid"),  # 资源标识
			
 
				+                            "@Font": "",
			
 
				+                            "ofd:FillColor": {"Value": "156 82 35"},
			
 
				+
			
 
				+                            "ofd:TextCode": {
			
 
				+                                "#text": block.get("text"),
			
 
				+                                "@X": "0",
			
 
				+                                "@Y": f"{block.get('size') / self.OP}",
			
 
				+                                "@DeltaX": f"g {count - 1} {length / count}"
			
 
				+                            },
			
 
				+
			
 
				+                            "@size": block.get("size") / self.OP,
			
 
				+                            "@Boundary": f"{x0} {y0} {length} {height}",
			
 
				+
			
 
				+                        })
			
 
				+                    elif block["type"] == "img":
			
 
				+                        ImageObject.append({
			
 
				+                            "@ID": 0,
			
 
				+                            "res_uuid": block.get("res_uuid"),  # 资源标识
			
 
				+
			
 
				+                            "@Boundary": f"{x0} {y0} {length} {height}",
			
 
				+                            "@ResourceID": f""  # 需要关联public res 里面的结果
			
 
				+
			
 
				+                        })
			
 
				+
			
 
				+                # for i in content:
			
 
				+                #     if i["type"] == "img":
			
 
				+                #         ImageObject.append(i)
			
 
				+                #     elif i["type"] == "text":
			
 
				+                #         TextObject.append(i)
			
 
				+
			
 
				+                conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,
			
 
				+
			
 
				+                                         CGTransform=[], PathObject=[], TextObject=TextObject, id_obj=id_obj)
			
 
				+                # print(conten)
			
 
				+                content_res_list.append(conten)
			
 
				+        else:
			
 
				+            pass
			
 
				+        return content_res_list
			
 
				+
			
 
				+    def pil_2_bytes(self, image):
			
 
				+        """"""
			
 
				+        # 创建一个 BytesIO 对象
			
 
				+        img_bytesio = BytesIO()
			
 
				+
			
 
				+        # 将图像保存到 BytesIO 对象
			
 
				+        image.save(img_bytesio, format='PNG')  # 你可以根据需要选择其他图像格式
			
 
				+
			
 
				+        # 获取 BytesIO 对象中的字节
			
 
				+        img_bytes = img_bytesio.getvalue()
			
 
				+
			
 
				+        # 关闭 BytesIO 对象
			
 
				+        img_bytesio.close()
			
 
				+        return img_bytes
			
 
				+
			
 
				+    def __call__(self, pdf_bytes=None, pil_img_list=None, optional_text=False):
			
 
				+        """
			
 
				+        input pdf | imgs if pdf  >optional_text or not
			
 
				+        0 解析pdf文件
			
 
				+        1 构建必要的ofd template
			
 
				+        2 转化为 ofd
			
 
				+        """
			
 
				+        pdf_obj = DPFParser()
			
 
				+        page_pil_img_list = None
			
 
				+
			
 
				+        # 插入图片ofd
			
 
				+        if pil_img_list:  # 读取 图片
			
 
				+            page_pil_img_list = [(self.pil_2_bytes(_img), _img.size[0] / self.OP, _img.size[1] / self.OP) for _img in
			
 
				+                                 pil_img_list]
			
 
				+        else:  # 读取 pdf 转图片
			
 
				+            if optional_text:  # 生成可编辑ofd:
			
 
				+                pdf_info_list, pfd_res_uuid_map = pdf_obj.extract_text_with_details(pdf_bytes)  # 解析pdf
			
 
				+                # logger.debug(f"pdf_info_list: {pdf_info_list} \n pfd_res_uuid_map {pfd_res_uuid_map}")
			
 
				+            else:
			
 
				+                img_list = pdf_obj.to_img(pdf_bytes)
			
 
				+                page_pil_img_list = [(self.pil_2_bytes(Image.frombytes("RGB", [_img.width, _img.height],
			
 
				+                                                                       _img.samples)), _img.width / self.OP,
			
 
				+                                      _img.height / self.OP) for _img in img_list]
			
 
				+
			
 
				+        id_obj = CurId()
			
 
				+
			
 
				+        if page_pil_img_list:  # img 内容转ofd
			
 
				+            res_static = {}  # 图片资源
			
 
				+            pfd_res_uuid_map = {"img": {}}
			
 
				+            PhysicalBox = f"0 0 {page_pil_img_list[0][1]} {page_pil_img_list[0][2]}"
			
 
				+            for idx, pil_img_tuple in enumerate(page_pil_img_list):
			
 
				+                pfd_res_uuid_map["img"][f"{idx}"] = pil_img_tuple[0]
			
 
				+                res_static[f"Image_{idx}.jpg"] = pil_img_tuple[0]
			
 
				+            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
			
 
				+            document = self.build_document(len(page_pil_img_list), id_obj=id_obj, PhysicalBox=PhysicalBox)
			
 
				+            public_res = self.build_public_res(id_obj=id_obj)
			
 
				+            document_res = self.build_document_res(len(page_pil_img_list), id_obj=id_obj,
			
 
				+                                                   pfd_res_uuid_map=pfd_res_uuid_map)
			
 
				+
			
 
				+            content_res_list = self.build_content_res(page_pil_img_list, id_obj=id_obj,
			
 
				+                                                      pfd_res_uuid_map=pfd_res_uuid_map)
			
 
				+
			
 
				+
			
 
				+        else:
			
 
				+            #  生成的文档结构对象需要传入id实例
			
 
				+            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
			
 
				+            document = self.build_document(len(pdf_info_list), id_obj=id_obj)
			
 
				+            public_res = self.build_public_res(id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
			
 
				+            document_res = self.build_document_res(len(pdf_info_list), id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
			
 
				+            content_res_list = self.build_content_res(pdf_info_list=pdf_info_list, id_obj=id_obj,
			
 
				+                                                      pfd_res_uuid_map=pfd_res_uuid_map)
			
 
				+
			
 
				+            res_static = {}  # 图片资源
			
 
				+
			
 
				+            print("pfd_res_uuid_map", pfd_res_uuid_map)
			
 
				+            img_dict = pfd_res_uuid_map.get("img")
			
 
				+            if img_dict:
			
 
				+                for key, v_io in img_dict.items():
			
 
				+                    res_static[f"Image_{key}.jpg"] = v_io.getvalue()
			
 
				+
			
 
				+        # 生成 ofd 文件
			
 
				+        ofd_byte = OFDStructure("123", ofd=ofd_entrance, document=document, public_res=public_res,
			
 
				+                                document_res=document_res, content_res=content_res_list, res_static=res_static)(
			
 
				+            test=True)
			
 
				+        return ofd_byte
			
 
				+
			
 
				+
			
 
if __name__ == "__main__":
    # Manual smoke test: read a local file, convert it with OFDWrite and write
    # the result to "ofd.ofd" in the current working directory.
    pdf_p = r"D:\renodoc\技术栈\GBT_33190-2016_电子文件存储与交换格式版式文档.pdf"
    # NOTE(review): this second assignment overrides the path above, and it has
    # no file extension -- it may point at a directory; confirm before running.
    pdf_p = r"F:\code\easyofd\test"
    with open(pdf_p, "rb") as f:
        content = f.read()

    # Default call: no image list and optional_text=False.
    ofd_content = OFDWrite()(content)

    with open("ofd.ofd", "wb") as f:
        f.write(ofd_content)
			
--- a/format_convert/easyofd/easyofd/draw/draw_pdf.py
+++ b/format_convert/easyofd/easyofd/draw/draw_pdf.py
@@ -0,0 +1,1178 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME: E:\code\easyofd\easyofd\draw
			
 
				+# CREATE_TIME: 2023-08-10
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE:  绘制pdf
			
 
				+import base64
			
 
				+import math
			
 
				+import os
			
 
				+import re
			
 
				+import sys
			
 
				+import traceback
			
 
				+from io import BytesIO
			
 
				+
			
 
				+from PIL import Image as PILImage, Image, ImageFont, ImageDraw
			
 
				+from fontTools.ttLib import TTFont
			
 
				+from loguru import logger
			
 
				+from reportlab.lib.pagesizes import A4
			
 
				+from reportlab.lib.utils import ImageReader
			
 
				+from reportlab.pdfgen import canvas
			
 
				+
			
 
				+from format_convert.utils import special_font_to_normal, image_resize_by_ratio
			
 
				+
			
 
				+sys.path.append(os.path.dirname(__file__) + "/../../../../")
			
 
				+from format_convert.easyofd.easyofd.draw.font_tools import FontTool
			
 
				+from .find_seal_img import SealExtract
			
 
				+
			
 
				+
			
 
				+# print(reportlab_fonts)
			
 
				+class DrawPDF():
			
 
				+    """
			
 
				+    ofd 解析结果 绘制pdf
			
 
				+    OP ofd 单位转换
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, data, *args, **kwargs):
			
 
				+        assert data, "未输入ofd解析结果"
			
 
				+        self.data = data
			
 
				+        self.author = "renoyuan"
			
 
				+        self.OP = 200 / 25.4
			
 
				+        # self.OP = 1
			
 
				+        self.pdf_uuid_name = self.data[0]["pdf_name"]
			
 
				+        self.pdf_io = BytesIO()
			
 
				+        self.SupportImgType = ("JPG", "JPEG", "PNG")
			
 
				+        self.init_font = "宋体"
			
 
				+        self.font_tool = FontTool()
			
 
				+        self.page_need_to_image_dict = {}
			
 
				+
			
 
				+    def draw_lines(my_canvas):
			
 
				+        """
			
 
				+        draw_line
			
 
				+        """
			
 
				+        my_canvas.setLineWidth(.3)
			
 
				+
			
 
				+        start_y = 710
			
 
				+        my_canvas.line(30, start_y, 580, start_y)
			
 
				+
			
 
				+        for x in range(10):
			
 
				+            start_y -= 10
			
 
				+            my_canvas.line(30, start_y, 580, start_y)
			
 
				+
			
 
    def gen_empty_pdf(self):
        """Write a one-page A4 placeholder PDF into ``self.pdf_io``.

        The page carries a single notice string drawn with ``self.init_font``
        at size 20, telling the reader that the OFD input could not be parsed.
        """
        placeholder = canvas.Canvas(self.pdf_io)
        placeholder.setPageSize(A4)
        placeholder.setFont(self.init_font, 20)
        placeholder.drawString(0, 210, "ofd 格式错误,不支持解析", mode=1)
        placeholder.save()
			
 
				+
			
 
				+    # 单个字符偏移量计算
			
 
				+    def cmp_offset(self, pos, offset, DeltaRule, text, CTM_info, dire="X") -> list:
			
 
				+        """
			
 
				+        pos 文本框x|y 坐标 
			
 
				+        offset 第一个字符的X|Y 
			
 
				+        DeltaRule 偏移量规则
			
 
				+        resize 字符坐标缩放
			
 
				+        返回 x|y  字符位置list 
			
 
				+        """
			
 
				+        if CTM_info and dire == "X":
			
 
				+            resize = CTM_info.get("resizeX")
			
 
				+            rotate = CTM_info.get("rotateX")
			
 
				+            move = CTM_info.get("moveX")
			
 
				+        elif CTM_info and dire == "Y":
			
 
				+            resize = CTM_info.get("resizeY")
			
 
				+            rotate = CTM_info.get("rotateY")
			
 
				+            move = CTM_info.get("moveY")
			
 
				+        else:
			
 
				+            resize = 1
			
 
				+            rotate = 0
			
 
				+            move = 0
			
 
				+
			
 
				+        # print(f"resize is {resize}")
			
 
				+        char_pos = float(pos if pos else 0) + (float(offset if offset else 0) + move) * resize
			
 
				+        pos_list = []
			
 
				+        pos_list.append(char_pos)  # 放入第一个字符
			
 
				+        offsets = [i for i in DeltaRule.split(" ")]
			
 
				+
			
 
				+        if "g" in DeltaRule:  # g 代表多个元素
			
 
				+            g_no = None
			
 
				+            for _no, offset_i in enumerate(offsets):
			
 
				+
			
 
				+                if offset_i == "g":
			
 
				+                    g_no = _no
			
 
				+                    for j in range(int(offsets[(g_no + 1)])):
			
 
				+                        char_pos += float(offsets[(g_no + 2)])
			
 
				+                        pos_list.append(char_pos)
			
 
				+
			
 
				+                elif offset_i and offset_i != "g":
			
 
				+                    if g_no == None:
			
 
				+                        char_pos += float(offset_i) * resize
			
 
				+                        pos_list.append(char_pos)
			
 
				+                    elif (int(_no) > int(g_no + 2)) and g_no != None:
			
 
				+                        # print(f"offset_i is {offset_i}")
			
 
				+                        char_pos += float(offset_i) * resize
			
 
				+                        pos_list.append(char_pos)
			
 
				+
			
 
				+        elif not DeltaRule:  # 没有字符偏移量 一般单字符
			
 
				+            pos_list = []
			
 
				+            for i in range(len(text)):
			
 
				+                pos_list.append(char_pos)
			
 
				+        else:  # 有字符偏移量
			
 
				+            for i in offsets:
			
 
				+                if not i:
			
 
				+                    char_pos += 0
			
 
				+                else:
			
 
				+                    char_pos += float(i) * resize
			
 
				+                pos_list.append(char_pos)
			
 
				+
			
 
				+        return pos_list
			
 
				+
			
 
    def draw_chars_old(self, canvas, text_list, fonts, page_size):
        """Write text objects onto the canvas (older variant of ``draw_chars``).

        For every entry of ``text_list`` the font and colour are set, per-character
        coordinates are computed with ``cmp_offset`` and the text is drawn either as
        one string (when the last computed coordinate falls outside the page) or
        character by character.

        NOTE(review): the two ``break`` statements in the per-character path stop
        after the first character AND leave the outer loop, so at most one character
        is drawn on that path and no later text object is processed. Confirm whether
        this is leftover debugging before reusing this method.

        Args:
            canvas: drawing target providing ``setFont``, ``setFillColorRGB``,
                ``setStrokeColorRGB``, ``drawString`` and ``drawImage``.
            text_list: iterable of dicts; keys read here are ``text``, ``size``,
                ``color``, ``DeltaX``, ``DeltaY``, ``X``, ``Y``, ``CTM`` and ``pos``.
            fonts: unused (the font lookup is commented out).
            page_size: sequence; index 2 and 3 are used as page width and height.
        """
        c = canvas
        for line_dict in text_list:
            # TODO: serialize the whole body text once before writing, so the final
            # input values are easier to inspect.
            text = line_dict.get("text")
            # font_info = fonts.get(line_dict.get("font"), {})
            # if font_info:
            #     font_name = font_info.get("FontName", "")
            # else:
            #     font_name = self.init_font
            # print(f"font_name:{font_name}")

            # TODO: check whether the font is a known common font; otherwise fall
            # back to a similar one.
            # if font_name not in self.font_tool.FONTS:
            #     font_name = self.font_tool.FONTS[0]
            font_name = self.init_font

            font = self.font_tool.normalize_font_name(font_name)
            # print(f"font_name:{font_name} font:{font}")

            try:
                c.setFont(font, line_dict["size"] * self.OP)
            except KeyError as key_error:
                # Fall back to the first font known to the font tool.
                logger.error(f"{key_error}")
                font = self.font_tool.FONTS[0]
                c.setFont(font, line_dict["size"] * self.OP)
            # The origin is at the bottom-left corner of the page.
            color = line_dict.get("color", [0, 0, 0])
            if len(color) < 3:
                color = [0, 0, 0]

            c.setFillColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
            c.setStrokeColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)

            DeltaX = line_dict.get("DeltaX", "")
            DeltaY = line_dict.get("DeltaY", "")
            # print("DeltaX",DeltaX)
            X = line_dict.get("X", "")
            Y = line_dict.get("Y", "")
            CTM = line_dict.get("CTM", "")  # OFD transform that adds per-text scaling
            # resizeX stays 1 and only feeds the font size below; resizeY is never read.
            resizeX = 1
            resizeY = 1
            # CTM =None # some data does not use this CTM
            CTMS = None
            if CTM:
                CTMS = CTM.split(" ")

            # Only a six-element CTM is turned into transform info.
            if CTM and CTMS and len(CTMS) == 6:
                CTM_info = {
                    "resizeX": float(CTMS[0]),
                    "rotateX": float(CTMS[1]),
                    "rotateY": float(CTMS[2]),
                    "resizeY": float(CTMS[3]),
                    "moveX": float(CTMS[4]),
                    "moveY": float(CTMS[5]),

                }

            else:
                CTM_info = {}
            x_list = self.cmp_offset(line_dict.get("pos")[0], X, DeltaX, text, CTM_info, dire="X")
            y_list = self.cmp_offset(line_dict.get("pos")[1], Y, DeltaY, text, CTM_info, dire="Y")

            # print("x_list",x_list)
            # print("y_list",y_list)
            # print("Y",page_size[3])
            # print("x",page_size[2])
            # if line_dict.get("Glyphs_d") and  FontFilePath.get(line_dict["font"])  and font_f not in FONTS:
            # Glyph drawing for custom fonts: disabled for now because drawPath
            # performs poorly. NOTE(review): this dead branch references
            # ``draw_Glyph`` and ``FontFilePath``, which are not defined in the
            # visible part of this file.
            if False:
                Glyphs = [int(i) for i in line_dict.get("Glyphs_d").get("Glyphs").split(" ")]
                for idx, Glyph_id in enumerate(Glyphs):
                    _cahr_x = float(x_list[idx]) * self.OP
                    _cahr_y = (float(page_size[3]) - (float(y_list[idx]))) * self.OP
                    imageFile = draw_Glyph(FontFilePath.get(line_dict["font"]), Glyph_id, text[idx])

                    # font_img_info.append((FontFilePath.get(line_dict["font"]), Glyph_id,text[idx],_cahr_x,_cahr_y,-line_dict["size"]*Op*2,line_dict["size"]*Op*2))
                    c.drawImage(imageFile, _cahr_x, _cahr_y, -line_dict["size"] * self.OP * 2,
                                line_dict["size"] * self.OP * 2)
            else:
                # More characters than computed positions: keep only the characters
                # in the range U+4E00..U+9FA5.
                if len(text) > len(x_list) or len(text) > len(y_list):
                    text = re.sub("[^\u4e00-\u9fa5]", "", text)
                try:
                    # Write the whole line when the last character's computed y or x
                    # lies beyond the page, or is negative.
                    if y_list[-1] * self.OP > page_size[3] * self.OP or x_list[-1] * self.OP > page_size[2] * self.OP or \
                            x_list[-1] < 0 or y_list[-1] < 0:
                        # if True:
                        # print("line wtite")
                        x_p = abs(float(X)) * self.OP
                        y_p = abs(float(page_size[3]) - (float(Y))) * self.OP
                        print('text, x_p, y_p', text, x_p, y_p)
                        c.drawString(x_p, y_p, text, mode=0)  # mode=3: text invisible, 0: visible

                        # text_write.append((x_p,  y_p, text))
                    # Otherwise write character by character.
                    else:
                        for char_id, _char in enumerate(text):
                            if len(x_list) > char_id:
                                # print("char wtite")
                                font_size = line_dict["size"] * self.OP * resizeX
                                c.setFont(font, line_dict["size"] * self.OP * resizeX)
                                _char_x = float(x_list[char_id]) * self.OP
                                # Flip the Y axis using the page height.
                                _char_y = (float(page_size[3]) - (float(y_list[char_id]))) * self.OP
                                # print(_cahr_x,  _cahr_y, _cahr_)
                                print('_cahr_, _char_x, _char_y', _char, _char_x, _char_y, font_size)
                                c.drawString(_char_x, _char_y, _char, mode=0)  # mode=3: text invisible, 0: visible
                                # Stops after the first drawn character.
                                break
                            else:
                                pass
                                # logger.debug(f"match {_cahr_} pos error \n{text} \n{x_list}")
                            # text_write.append((_cahr_x,  _cahr_y, _cahr_))
                        # Leaves the outer ``for line_dict`` loop as well.
                        break
                except Exception as e:
                    # Best effort: log the failure and continue with the next text object.
                    logger.error(f"{e}")
                    traceback.print_exc()
			
 
				+
			
 
				+    def draw_chars(self, canvas, text_list, fonts, page_size, pdf_page_size):
			
 
				+        """写入字符"""
			
 
				+        for line_dict in text_list:
			
 
				+            # TODO 写入前对于正文内容整体序列化一次 方便 查看最后输入值 对于最终 格式先
			
 
				+            # print('line_dict', line_dict)
			
 
				+            text = line_dict.get("text")
			
 
				+            text_size = line_dict.get("size")
			
 
				+            if not text_size:
			
 
				+                print('draw_chars not text_size', text)
			
 
				+                return
			
 
				+
			
 
				+            # 变换矩阵
			
 
				+            ctm = line_dict.get("CTM", '')
			
 
				+            ctm = self.get_ctm(ctm)
			
 
				+            a, b, c, d, e, f = ctm
			
 
				+            # 计算水平和垂直方向的缩放因子的平均值
			
 
				+            font_scale = (a + d) / 2
			
 
				+
			
 
				+            color = line_dict.get("color", [0, 0, 0])
			
 
				+            if len(color) < 3:
			
 
				+                color = [0, 0, 0]
			
 
				+            canvas.setFillColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
			
 
				+            # c.setStrokeColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
			
 
				+
			
 
				+            # 文本框范围
			
 
				+            boundary = line_dict.get("pos")
			
 
				+            if len(boundary) != 4:
			
 
				+                print('draw_chars not boundary', text, boundary)
			
 
				+                return
			
 
				+            left, top, width, height = boundary
			
 
				+
			
 
				+            # 根据delta_x判断有重复文本
			
 
				+            delta_x = line_dict.get("DeltaX", "")
			
 
				+            delta_y = line_dict.get("DeltaY", "")
			
 
				+            g_cnt = re.findall('g', delta_x)
			
 
				+            if len(g_cnt) >= 2:
			
 
				+                split_index = len(text) / 2
			
 
				+                if text[:int(split_index)] == text[int(split_index):]:
			
 
				+                    text2 = text[:int(split_index)]
			
 
				+                    print('len(g_cnt) >= 2', g_cnt, text, '->', text2)
			
 
				+                    text = text2
			
 
				+
			
 
				+            # 文字相对与boundary的偏移
			
 
				+            x = line_dict.get("X", "")
			
 
				+            y = line_dict.get("Y", "")
			
 
				+            if "" in [x, y]:
			
 
				+                print('draw_chars not x or not y', text, x, y)
			
 
				+                return
			
 
				+            x, y = float(x) * font_scale, float(y) * font_scale
			
 
				+
			
 
				+            font_name = self.init_font
			
 
				+            font = self.font_tool.normalize_font_name(font_name)
			
 
				+
			
 
				+            # boundary, x, y 计算实际坐标
			
 
				+            actual_left = left + x
			
 
				+            actual_right = actual_left + width
			
 
				+            actual_top = top + y
			
 
				+            actual_bottom = actual_top + y
			
 
				+
			
 
				+            # print('actual_left, actual_top', text, actual_left, actual_top)
			
 
				+
			
 
				+            # ctm, text_size 计算字体大小
			
 
				+            actual_size = text_size * font_scale
			
 
				+
			
 
				+            canvas.setFont(font, actual_size * self.OP)
			
 
				+
			
 
				+            # print('actual_bottom, y', actual_bottom, y)
			
 
				+            # ofd原点在左上角，pdf原点在左下角
			
 
				+            try:
			
 
				+                # print('text111', text, actual_left * self.OP, pdf_page_size[3] - actual_bottom * self.OP)
			
 
				+                # 按行写入
			
 
				+                canvas.drawString(actual_left * self.OP,
			
 
				+                                  pdf_page_size[3] - actual_top * self.OP,
			
 
				+                                  text, mode=0)
			
 
				+
			
 
				+            except Exception as e:
			
 
				+                logger.error(f"{e}")
			
 
				+                traceback.print_exc()
			
 
				+
			
 
				+    def draw_odf_char_on_image(self, line_dict, img, pos, ofd_page_size):
			
 
				+        text = line_dict.get("text")
			
 
				+        text_size = line_dict.get("size")
			
 
				+        if not text_size:
			
 
				+            print('get_odf_char_info not text_size', text)
			
 
				+            return
			
 
				+
			
 
				+        # 变换矩阵
			
 
				+        ctm = line_dict.get("CTM", '')
			
 
				+        ctm = self.get_ctm(ctm)
			
 
				+        a, b, c, d, e, f = ctm
			
 
				+        # 计算水平和垂直方向的缩放因子的平均值
			
 
				+        font_scale = (a + d) / 2
			
 
				+
			
 
				+        color = line_dict.get("color", [0, 0, 0])
			
 
				+        if len(color) < 3:
			
 
				+            color = (0, 0, 0)
			
 
				+        else:
			
 
				+            color = tuple([int(x) for x in color])
			
 
				+        # print('color', color)
			
 
				+
			
 
				+        # 文本框范围
			
 
				+        boundary = line_dict.get("pos")
			
 
				+        if len(boundary) != 4:
			
 
				+            print('get_odf_char_info not boundary', text, boundary)
			
 
				+            return
			
 
				+        left, top, width, height = boundary
			
 
				+
			
 
				+        # 文字相对与boundary的偏移，y小于size的话会显示不完全
			
 
				+        x = line_dict.get("X", "")
			
 
				+        y = line_dict.get("Y", "")
			
 
				+        # print('x, y', x, y)
			
 
				+        if "" in [x, y]:
			
 
				+            print('get_odf_char_info not x or not y', text, x, y)
			
 
				+            return
			
 
				+        x, y = float(x) * a, float(y) * d
			
 
				+
			
 
				+        # boundary, x, y 计算实际坐标
			
 
				+        actual_left = left
			
 
				+        actual_right = actual_left + x
			
 
				+        actual_top = top
			
 
				+        actual_bottom = actual_top + y
			
 
				+
			
 
				+        # print('actual_left', actual_left, ofd_page_size[2], pos[2])
			
 
				+        # actual_left = actual_left / ofd_page_size[2] * pos[2]
			
 
				+        # print('actual_left2', actual_left, ofd_page_size[2], pos[2])
			
 
				+        # actual_top = actual_top / ofd_page_size[3] * pos[3]
			
 
				+
			
 
				+        # actual_bottom = bottom + y
			
 
				+        # actual_top = actual_bottom + y
			
 
				+
			
 
				+        # print('actual_left, actual_top', text, actual_left, actual_top)
			
 
				+
			
 
				+        # ctm, text_size 计算字体大小
			
 
				+        actual_size = text_size * font_scale
			
 
				+        actual_size = int(actual_size * img.size[0] / pos[2])
			
 
				+
			
 
				+        left_top_point = [actual_left * img.size[0] / pos[2], actual_top * img.size[1] / pos[3]]
			
 
				+        left_top_point = [int(x) for x in left_top_point]
			
 
				+        draw = ImageDraw.Draw(img)
			
 
				+        font = ImageFont.truetype(os.path.dirname(__file__) + '/simsun.ttc', actual_size)
			
 
				+
			
 
				+        # print('text left_top_point, actual_size', text, left_top_point, actual_size)
			
 
				+        # print('img.size', img.size)
			
 
				+
			
 
				+        draw.text(left_top_point, text, font=font, fill=color)
			
 
				+        return img
			
 
				+
			
 
				+    def compute_ctm(self, CTM, x1, y1, img_width, img_height):
			
 
				+        """待定方法"""
			
 
				+        a, b, c, d, e, f = CTM.split(" ")
			
 
				+        a, b, c, d, e, f = float(a), float(b), float(c), float(d), float(e), float(f)
			
 
				+        # 定义变换矩阵的元素
			
 
				+
			
 
				+        # 计算原始矩形的宽和高
			
 
				+        x2 = x1 + img_width
			
 
				+        y2 = y1 + img_height
			
 
				+        print(f"ori x1 {x1} y1 {y1} x2 {x2} y2 {y2} img_width {img_width} img_height {img_height}")
			
 
				+        a = a / 10
			
 
				+        d = d / 10
			
 
				+        # 对左上角和右下角点进行变换
			
 
				+        x1_new = a * x1 + c * y1 + (e)
			
 
				+        y1_new = b * x1 + d * y1 + (f)
			
 
				+        x2_new = a * x2 + c * y2 + (e)
			
 
				+        y2_new = b * x2 + d * y2 + (f)
			
 
				+        print(f"x1_new {x1_new} y1_new {y1_new} x2_new {x2_new} y2_new {y2_new}")
			
 
				+        # 计算变换后矩形的宽和高
			
 
				+        w_new = x2_new - x1_new
			
 
				+        h_new = y2_new - y1_new
			
 
				+
			
 
				+        print(f"原始矩形宽度: {img_width}, 高度: {img_height}")
			
 
				+        print(f"变换后矩形宽度: {w_new}, 高度: {h_new}")
			
 
				+        return x1_new, y1_new, w_new, h_new
			
 
				+
			
 
				+    def get_ctm(self, ctm):
			
 
				+        default_ctm = (1, 0, 0, 1, 0, 0)
			
 
				+        if not ctm:
			
 
				+            # print('get_ctm no ctm!', ctm)
			
 
				+            return default_ctm
			
 
				+        ctm = ctm.split(" ")
			
 
				+        if len(ctm) != 6:
			
 
				+            print('get_ctm len(ctm) != 6', ctm)
			
 
				+            return default_ctm
			
 
				+        ctm = [float(x) for x in ctm]
			
 
				+        # a, b, c, d, e, f = ctm
			
 
				+        return ctm
			
 
				+
			
 
    def draw_img_old(self, canvas, img_list, images, page_size):
        """Write images onto the canvas (older variant of ``draw_img``).

        Args:
            canvas: drawing target providing ``drawImage``.
            img_list: iterable of dicts with ``ResourceID``, ``CTM`` and ``pos``.
            images: mapping from resource id to a dict with ``suffix``,
                ``imgb64`` (base64 image data), ``fileName`` and optionally
                ``wrap_pos``.
            page_size: sequence; element 3 is used as the page height when
                flipping the Y axis.

        NOTE(review): ``CTM`` is reset to ``None`` right before it is tested, so
        the CTM branch can never run, and in the ``elif pos`` branch the computed
        x/y/w/h are overwritten with ``0, 0`` and the raw pixel size of the image.
        Both look like leftover experiments; confirm before relying on this method.
        """
        c = canvas
        for img_d in img_list:
            image = images.get(img_d["ResourceID"])

            # Skip unknown resources and unsupported image types.
            if not image or image.get("suffix").upper() not in self.SupportImgType:
                continue

            imgbyte = base64.b64decode(image.get('imgb64'))
            if not imgbyte:
                logger.error(f"{image['fileName']} is null")
                continue

            img = PILImage.open(BytesIO(imgbyte))
            img_width, img_height = img.size
            # img_width = img_width / self.OP *25.4
            # img_height = img_height / self.OP *25.4
            info = img.info  # only referenced by the commented-out debug print below
            # print( f"ing info dpi {info.get('dpi')}")
            # print(img_width, img_height)
            imgReade = ImageReader(img)
            CTM = img_d.get('CTM')
            # print("CTM", CTM)

            wrap_pos = image.get("wrap_pos")
            # print("wrap_pos", wrap_pos)
            pos = img_d.get('pos')
            # print("pos", pos)
            # Discards the CTM read above, which disables the branch below.
            CTM = None
            if CTM and not wrap_pos and page_size == pos:
                x1_new, y1_new, w_new, h_new = self.compute_ctm(CTM, 0, 0, img_width, img_height)
                pdf_pos = [pos[0] * self.OP, pos[1] * self.OP, pos[2] * self.OP, pos[3] * self.OP]
                print(f"pos: {pos} pdf_pos: {pdf_pos}")

                x1_new = (pos[0] + x1_new) * self.OP
                y1_new = (page_size[3] - y1_new) * self.OP
                # Clamp the transformed size to the size of the box.
                if w_new > pdf_pos[2]:
                    w_new = pdf_pos[2]
                if h_new > pdf_pos[3]:
                    h_new = pdf_pos[3]
                print(f"写入 {x1_new} {y1_new} {w_new} {-h_new}")
                c.drawImage(imgReade, x1_new, y1_new, w_new, -h_new, 'auto')
            else:
                x_offset = 0
                y_offset = 0

                # Convert to canvas units and flip the Y axis with the page height.
                x = (pos[0] + x_offset) * self.OP
                y = (page_size[3] - (pos[1] + y_offset)) * self.OP
                if wrap_pos:
                    # Shift by the wrapping offset; the negative height makes the
                    # image extend downwards from y.
                    x = x + (wrap_pos[0] * self.OP)
                    y = y - (wrap_pos[1] * self.OP)
                    w = img_d.get('pos')[2] * self.OP
                    h = -img_d.get('pos')[3] * self.OP

                    # print(x, y, w, h)
                    c.drawImage(imgReade, x, y, w, h, 'auto')
                elif pos:
                    # print(f"page_size == pos :{page_size == pos} ")
                    x = pos[0] * self.OP
                    y = (page_size[3] - pos[1]) * self.OP
                    w = pos[2] * self.OP
                    h = -pos[3] * self.OP

                    # print("pos",pos[0],pos[1],pos[2]* self.OP,pos[3]* self.OP)
                    # print(x2_new, -y2_new, w_new, h_new,)

                    # Overrides the values computed above: draw at the origin with
                    # the image's pixel size.
                    x, y = 0, 0
                    w, h = img.size

                    print('x, y, w, h', x, y, w, h)

                    c.drawImage(imgReade, x, y, w, h, 'auto')
                    # c.drawImage(imgReade,x2_new, -y2_new, w_new, h_new, 'auto')
			
 
				+
			
 
				+    def draw_img(self, canvas, img_list, images, ofd_page_size, pdf_page_size, ofd_to_pdf_ratio):
			
 
				+        """写入图片"""
			
 
				+        c = canvas
			
 
				+        for img_d in img_list:
			
 
				+            image = images.get(img_d["ResourceID"])
			
 
				+            if not image or image.get("suffix").upper() not in self.SupportImgType:
			
 
				+                print('img_d["ResourceID"]', img_d["ResourceID"])
			
 
				+                logger.error(f"not image")
			
 
				+                continue
			
 
				+
			
 
				+            imgbyte = base64.b64decode(image.get('imgb64'))
			
 
				+            if not imgbyte:
			
 
				+                logger.error(f"{image['fileName']} is null")
			
 
				+                continue
			
 
				+
			
 
				+            img = PILImage.open(BytesIO(imgbyte))
			
 
				+            info = img.info
			
 
				+            # print( f"ing info dpi {info.get('dpi')}")
			
 
				+            ctm = img_d.get('CTM')
			
 
				+            # print("ctm", ctm)
			
 
				+            pos = img_d.get('pos')
			
 
				+            pdf_pos = [x * ofd_to_pdf_ratio for x in pos]
			
 
				+            # print('pos', pos)
			
 
				+            # print('pdf_pos', pdf_pos)
			
 
				+            # print('ofd_page_size', ofd_page_size)
			
 
				+            # print('pdf_page_size', pdf_page_size)
			
 
				+            if pos:
			
 
				+                if pos[2] <= 0.1 or pos[3] <= 0.1:
			
 
				+                    print('pos[2] <= 0.1 or pos[3] <= 0.1')
			
 
				+                    continue
			
 
				+                x, y = pdf_pos[0], pdf_page_size[3] - pdf_pos[1] - pdf_pos[3]
			
 
				+                w, h = img.size
			
 
				+                ctm = ctm.split(' ')
			
 
				+                ctm = [float(x) for x in ctm]
			
 
				+                a, b, d, e, f, g = ctm
			
 
				+                if b == 0 and d == 0:
			
 
				+                    angle_deg = 0
			
 
				+                else:
			
 
				+                    # 计算旋转角度，考虑可能的镜像翻转
			
 
				+                    angle_rad = math.atan2(b, a)
			
 
				+                    angle_deg = math.degrees(angle_rad)
			
 
				+                    # 调整角度到 0 到 360 度范围内
			
 
				+                    angle_deg = angle_deg % 360
			
 
				+                img = img.rotate(-angle_deg, expand=1)
			
 
				+                img = img.resize((int(pdf_pos[2]), int(pdf_pos[3])), Image.BICUBIC)
			
 
				+                img = image_resize_by_ratio(img, int(pdf_page_size[2]), int(pdf_page_size[3]))
			
 
				+                # img = img.resize((int(pdf_page_size[2]), int(pdf_page_size[3])), Image.BICUBIC)
			
 
				+                # img = img.rotate(180, expand=1)
			
 
				+                w, h = img.size
			
 
				+                # print('jb2 angle_deg, x, y, w, h', angle_deg, x, y, w, h)
			
 
				+                if img.mode == 'P':
			
 
				+                    img = img.convert('RGBA')
			
 
				+                imgReade = ImageReader(img)
			
 
				+                # print('img.size, x, y, w, h, img.mode', img.size, x, y, w, h, img.mode)
			
 
				+                c.drawImage(imgReade, x, y, w, h, 'auto')
			
 
				+
			
 
				+    def draw_img_with_annot(self, canvas, img_list, images, annot_page_size, pdf_page_size, ofd_to_pdf_ratio, annot_page_info):
			
 
				+        """写入图片"""
			
 
				+        c = canvas
			
 
				+        for img_d in img_list:
			
 
				+            image = images.get(img_d["ResourceID"])
			
 
				+            if not image or image.get("suffix").upper() not in self.SupportImgType:
			
 
				+                print('img_d["ResourceID"]', img_d["ResourceID"])
			
 
				+                logger.error(f"not image")
			
 
				+                continue
			
 
				+
			
 
				+            imgbyte = base64.b64decode(image.get('imgb64'))
			
 
				+            if not imgbyte:
			
 
				+                logger.error(f"{image['fileName']} is null")
			
 
				+                continue
			
 
				+
			
 
				+            img = PILImage.open(BytesIO(imgbyte))
			
 
				+            ctm = img_d.get('CTM')
			
 
				+            pos = img_d.get('pos')
			
 
				+            pdf_pos = [x * ofd_to_pdf_ratio for x in pos]
			
 
				+            if pos:
			
 
				+                if pos[2] <= 0.1 or pos[3] <= 0.1:
			
 
				+                    print('pos[2] <= 0.1 or pos[3] <= 0.1')
			
 
				+                    continue
			
 
				+                x, y = pdf_pos[0], pdf_page_size[3] - pdf_pos[1] - pdf_pos[3]
			
 
				+                w, h = img.size
			
 
				+                ctm = ctm.split(' ')
			
 
				+                ctm = [float(x) for x in ctm]
			
 
				+                a, b, d, e, f, g = ctm
			
 
				+                if b == 0 and d == 0:
			
 
				+                    angle_deg = 0
			
 
				+                else:
			
 
				+                    # 计算旋转角度，考虑可能的镜像翻转
			
 
				+                    angle_rad = math.atan2(b, a)
			
 
				+                    angle_deg = math.degrees(angle_rad)
			
 
				+                    # 调整角度到 0 到 360 度范围内
			
 
				+                    angle_deg = angle_deg % 360
			
 
				+
			
 
				+                img = img.rotate(-angle_deg, expand=1)
			
 
				+                print('angle_deg', angle_deg)
			
 
				+
			
 
				+                # 画上注释文字
			
 
				+                # text_list = annot_page_info
			
 
				+                for text_d in annot_page_info:
			
 
				+                    # print('text_d', text_d)
			
 
				+                    # print('img.size', img.size)
			
 
				+                    print('img pos', pos)
			
 
				+                    img = self.draw_odf_char_on_image(text_d, img, pos, annot_page_size)
			
 
				+
			
 
				+                img = img.resize((int(pdf_pos[2]), int(pdf_pos[3])), Image.BICUBIC)
			
 
				+                img = image_resize_by_ratio(img, int(pdf_page_size[2]), int(pdf_page_size[3]))
			
 
				+
			
 
				+                w, h = img.size
			
 
				+                if img.mode == 'P':
			
 
				+                    img = img.convert('RGBA')
			
 
				+                imgReade = ImageReader(img)
			
 
				+                c.drawImage(imgReade, x, y, w, h, 'auto')
			
 
				+
			
 
    def draw_signature(self, canvas, signatures_page_list, page_size):
        """Draw the seal image of each signature onto ``canvas``.

        Every entry of ``signatures_page_list`` is read as a dict; the keys
        used here are:

        * ``"SignedValue"`` - passed to ``SealExtract`` as base64 data;
        * ``"Boundary"`` - a space separated string of numbers, of which the
          first four are used as x, y, width and height.

        Entries for which no image can be extracted are skipped.  Any
        exception stops the processing of the remaining entries; its
        traceback is printed and the method returns normally.

        :param canvas: object providing ``drawImage``
        :param signatures_page_list: list of signature dicts (may be empty or None)
        :param page_size: indexable page box; only ``page_size[3]`` is read
        """
        try:
            for sig in signatures_page_list or ():
                seal_images = SealExtract()(b64=sig.get("SignedValue"))
                if not seal_images:
                    # No seal picture could be extracted for this signature.
                    continue

                # Only the first extracted image is drawn.
                seal = seal_images[0]
                box = [float(part) for part in sig.get("Boundary").split(" ")]
                reader = ImageReader(seal)

                scale = self.OP
                # The height is passed negated and y is measured from
                # page_size[3]; presumably this flips the top-left based box
                # into the canvas coordinate system - confirm.
                canvas.drawImage(
                    reader,
                    box[0] * scale,
                    (page_size[3] - box[1]) * scale,
                    box[2] * scale,
                    -box[3] * scale,
                    'auto',
                )
        except Exception:
            traceback.print_exc()
			
 
				+
			
 
    def draw_line_old(self, canvas, line_list, page_size):
        """Draw the straight segments of each path (legacy implementation).

        Only ``L`` segments are rendered, each as an independent line from the
        most recent ``M`` point to the ``L`` point.  ``B`` and ``Q`` segments
        are recognised but not drawn.

        :param canvas: object providing ``setStrokeColorRGB``, ``setLineWidth``
            and ``line``
        :param line_list: list of dicts read via the keys ``"AbbreviatedData"``,
            ``"FillColor"``, ``"LineWidth"`` and ``"pos"``
        :param page_size: indexable page box; only ``page_size[3]`` is read
        """
        operators = ("S", "M", "L", "Q", "B", "A", "C")

        def match_mode(Abbr: list):
            """Group the AbbreviatedData tokens by path operator.

            Operators as described by the original notes:
            S  start point x, y
            M  move to x, y
            L  line from the current point to x, y
            Q  x1 y1 x2 y2 - quadratic Bezier curve
            B  x1 y1 x2 y2 x3 y3 - cubic Bezier curve
            A  arc to x, y (rx, ry, angle, large, sweep flags)
            C  close the current sub path

            Returns a list of ``{"mode": op, "points": [tokens...]}`` dicts.
            """
            segments = []
            current = {}
            for token in Abbr:
                if token in operators:
                    if current:
                        segments.append(current)
                    current = {"mode": token, "points": []}
                else:
                    current["points"].append(token)
            if Abbr:
                # The segment still being collected is flushed at the end.
                segments.append(current)
            return segments

        def assemble(relu_list: list):
            """Pair every B/Q/L segment with the latest preceding M segment."""
            origin = {}
            pairs = []
            for segment in relu_list:
                kind = segment.get("mode")
                if kind == "M":
                    origin = segment
                elif kind in ("B", "Q", "L"):
                    pairs.append({"start_point": origin, "end_point": segment})
            return pairs

        def convert_coord(p_list, direction, page_size, pos):
            """Convert path coordinates to canvas coordinates (ofd -> pdf)."""
            if direction == "x":
                return [(float(pos[0]) + float(p)) * self.OP for p in p_list]
            return [(float(page_size[3]) - float(pos[1]) - float(p)) * self.OP
                    for p in p_list]

        for line in line_list:
            tokens = line.get("AbbreviatedData").split(" ")
            color = line.get("FillColor", [0, 0, 0])
            pairs = assemble(match_mode(tokens))
            pos = line.get("pos")

            # Stroke colour; anything with fewer than 3 components is black.
            if len(color) < 3:
                color = [0, 0, 0]
            canvas.setStrokeColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)

            # Line width, falling back to 0.25 when missing or unparsable.
            try:
                raw_width = line.get("LineWidth", "0.25").replace(" ", "")
                stroke_width = (float(raw_width) if raw_width else 0.25) * self.OP
            except Exception:
                stroke_width = 0.25 * self.OP
            canvas.setLineWidth(stroke_width)

            for pair in pairs:
                if pair.get("end_point").get("mode") != 'L':
                    # Cubic (B) and quadratic (Q) curves are not rendered here.
                    continue
                x1, y1, x2, y2 = *pair.get("start_point").get("points"), *pair.get("end_point").get("points")
                x1, x2 = convert_coord([x1, x2], "x", page_size, pos)
                y1, y2 = convert_coord([y1, y2], "y", page_size, pos)
                canvas.line(x1, y1, x2, y2)
			
 
				+
			
 
    def draw_line_old_250619(self, canvas, line_list, page_size):
        """Draw each path as one canvas path object (legacy implementation).

        Handles the operators M, L, B, Q, A and C.  The arc (``A``) branch was
        marked as problematic by its author: it draws a fixed-size circle or
        ellipse rather than the requested arc.

        :param canvas: object providing ``beginPath``, ``setStrokeColorRGB``,
            ``setLineWidth`` and ``drawPath``
        :param line_list: list of dicts read via the keys ``"AbbreviatedData"``,
            ``"FillColor"``, ``"LineWidth"`` and ``"pos"``
        :param page_size: indexable page box; only ``page_size[3]`` is read
        """
        operators = ("S", "M", "L", "Q", "B", "A", "C")

        def match_mode(Abbr: list):
            """Group the AbbreviatedData tokens by path operator.

            Operators as described by the original notes:
            S  start point x, y
            M  move to x, y
            L  line from the current point to x, y
            Q  x1 y1 x2 y2 - quadratic Bezier curve from the current point to
               (x2, y2) with control point (x1, y1); current point becomes (x2, y2)
            B  x1 y1 x2 y2 x3 y3 - cubic Bezier curve from the current point to
               (x3, y3) with control points (x1, y1) and (x2, y2); current
               point becomes (x3, y3)
            A  rx ry angle large sweep x y - arc from the current point to
               (x, y); rx/ry are the ellipse axes, angle its rotation
               (positive = clockwise), large=1 selects the arc above 180
               degrees, sweep=1 means clockwise from start to end
            C  no operands; closes the sub path with a straight segment

            Returns a list of ``{"mode": op, "points": [tokens...]}`` dicts.
            """
            segments = []
            current = {}
            for token in Abbr:
                if token in operators:
                    if current:
                        segments.append(current)
                    current = {"mode": token, "points": []}
                else:
                    current["points"].append(token)
            if Abbr:
                # The segment still being collected is flushed at the end.
                segments.append(current)
            return segments

        def assemble(relu_list: list):
            """Turn parsed segments into ``{"start_point", "end_point"}`` actions.

            The first M (or any S) segment becomes the start point attached to
            the actions that follow it.
            """
            origin = {}
            actions = []
            for segment in relu_list:
                kind = segment.get("mode")
                if kind == "S":
                    origin = segment
                    continue
                if kind == "M" and not origin:
                    origin = segment
                if kind in ("M", "B", "Q", "L", "C", "A"):
                    actions.append({"start_point": origin, "end_point": segment})
            return actions

        def convert_coord(p_list, direction, page_size, pos):
            """Convert path coordinates to canvas coordinates (ofd -> pdf)."""
            if direction == "x":
                return [(float(pos[0]) + float(p)) * self.OP for p in p_list]
            return [(float(page_size[3]) - float(pos[1]) - float(p)) * self.OP
                    for p in p_list]

        for line in line_list:
            print('one line', "="*20)
            path = canvas.beginPath()
            tokens = line.get("AbbreviatedData").split(" ")
            color = line.get("FillColor", [0, 0, 0])
            actions = assemble(match_mode(tokens))
            pos = line.get("pos")

            # Stroke colour; anything with fewer than 3 components is black.
            if len(color) < 3:
                color = [0, 0, 0]
            canvas.setStrokeColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)

            # Line width, falling back to 0.25 when missing or unparsable.
            try:
                raw_width = line.get("LineWidth", "0.25").replace(" ", "")
                stroke_width = (float(raw_width) if raw_width else 0.25) * self.OP
            except Exception as e:
                logger.error(f"{e}")
                stroke_width = 0.25 * self.OP
            canvas.setLineWidth(stroke_width)

            cur_point = []
            for action in actions:
                target = action.get("end_point")
                kind = target.get("mode")

                if kind == 'M':
                    x, y = target.get("points")
                    x = convert_coord([x], "x", page_size, pos)[0]
                    y = convert_coord([y], "y", page_size, pos)[0]
                    cur_point = [x, y]
                    path.moveTo(x, y)

                elif kind == 'L':  # straight line
                    x, y = target.get("points")
                    print('path L x, y', x, y)
                    x = convert_coord([x], "x", page_size, pos)[0]
                    y = convert_coord([y], "y", page_size, pos)[0]
                    print('path L x, y2', x, y)
                    path.lineTo(x, y)

                elif kind == 'B':  # cubic Bezier
                    x1, y1, x2, y2, x3, y3 = target.get("points")
                    x1, x2, x3 = convert_coord([x1, x2, x3], "x", page_size, pos)
                    y1, y2, y3 = convert_coord([y1, y2, y3], "y", page_size, pos)
                    cur_point = [x2, y2]
                    path.curveTo(x1, y1, x2, y2, x3, y3)
                    path.moveTo(x3, y3)

                elif kind == 'Q':  # quadratic Bezier, emitted as a cubic
                    x1, y1, x2, y2 = target.get("points")
                    x1, x2 = convert_coord([x1, x2], "x", page_size, pos)
                    y1, y2 = convert_coord([y1, y2], "y", page_size, pos)
                    cur_point = [x2, y2]
                    path.curveTo(x1, y1, x2, y2, x2, y2)
                    path.moveTo(x2, y2)

                elif kind == 'A':  # arc
                    x1, y1 = action.get("start_point").get("points")
                    rx, ry, startAng, large_arc_flag, sweep_flag, x2, y2 = target.get("points")
                    # Compared on the raw tokens, before coordinate conversion.
                    same_radii = rx == ry

                    x1, x2, rx = convert_coord([x1, x2, rx], "x", page_size, pos)
                    y1, y2, ry = convert_coord([y1, y2, ry], "y", page_size, pos)

                    # Requires a current point set by an earlier M, B or Q.
                    cur_x, cur_y = cur_point

                    # NOTE(review): the original author flagged this arc
                    # drawing as incorrect; the shape size is a fixed 20.
                    if same_radii:
                        path.circle(rx, ry, 20)
                    else:
                        print(rx, ry, x2, y2, startAng, large_arc_flag, sweep_flag)
                        path.ellipse(rx, ry, 20, 20, )
                    path.moveTo(x2, y2)
                    cur_point = [x2, y2]

                elif kind == 'C':
                    path.close()
            canvas.drawPath(path)
			
 
				+
			
 
    def draw_line(self, canvas, line_list, page_size, pdf_page_size):
        """Draw every path of ``line_list`` onto ``canvas``.

        Each path is parsed from its ``"AbbreviatedData"`` string, mapped to
        page coordinates with ``self.get_ctm`` / ``self.get_actural_p`` and
        stroked as a single canvas path.  Only the operators ``M`` (move),
        ``L`` (line) and ``C`` (close) are rendered; ``S``, ``Q``, ``B`` and
        ``A`` segments are parsed but ignored.

        A path whose ``"pos"`` boundary is missing or does not have exactly
        four entries is reported and skipped; the remaining paths are still
        drawn.

        :param canvas: object providing ``beginPath``, ``setStrokeColorRGB``,
            ``setLineWidth`` and ``drawPath``
        :param line_list: list of dicts read via the keys ``"AbbreviatedData"``,
            ``"FillColor"``, ``"CTM"`` and ``"pos"``
        :param page_size: unused; kept for interface compatibility
        :param pdf_page_size: indexable page box in canvas units; only
            ``pdf_page_size[3]`` is read, to flip the y axis
        """
        operators = ("S", "M", "L", "Q", "B", "A", "C")

        def match_mode(tokens):
            """Group tokens into ``{"mode": op, "points": [...]}`` segments.

            Operators as described by the original notes:
            S  start point x, y
            M  move to x, y
            L  line from the current point to x, y
            Q  x1 y1 x2 y2 - quadratic Bezier curve
            B  x1 y1 x2 y2 x3 y3 - cubic Bezier curve
            A  rx ry angle large sweep x y - arc to (x, y)
            C  no operands; closes the sub path
            """
            segments = []
            current = None
            for token in tokens:
                if token in operators:
                    current = {"mode": token, "points": []}
                    segments.append(current)
                elif current is not None:
                    current["points"].append(token)
                # Operands that precede the first operator have no segment
                # to belong to and are dropped.
            return segments

        def to_canvas(points, ctm, boundary):
            """Map one ``x, y`` operand pair to canvas coordinates."""
            x, y = points
            x, y = self.get_actural_p(x, y, ctm, boundary)
            return x * self.OP, pdf_page_size[3] - y * self.OP

        for line in line_list:
            # Bounding box of the path object.
            boundary = line.get("pos")
            if boundary is None or len(boundary) != 4:
                print('draw_line not boundary', boundary)
                # Skip only this path; one bad entry must not drop the rest.
                continue

            # split() without an argument ignores repeated/leading/trailing
            # blanks, which would otherwise show up as empty operands.
            tokens = (line.get("AbbreviatedData") or "").split()
            segments = match_mode(tokens)

            # Transformation matrix of the path object.
            ctm = self.get_ctm(line.get("CTM", ''))

            # Stroke colour; anything with fewer than 3 components is black.
            color = line.get("FillColor") or [0, 0, 0]
            if len(color) < 3:
                color = [0, 0, 0]
            canvas.setStrokeColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)

            # A fixed stroke width is used for every path.
            canvas.setLineWidth(0.20 * self.OP)

            path = canvas.beginPath()
            for segment in segments:
                kind = segment["mode"]
                if kind == 'M':
                    path.moveTo(*to_canvas(segment["points"], ctm, boundary))
                elif kind == 'L':
                    path.lineTo(*to_canvas(segment["points"], ctm, boundary))
                elif kind == 'C':
                    path.close()
            canvas.drawPath(path)
			
 
				+
			
 
    def get_actural_p(self, x, y, ctm, boundary):
        """Map a point of a path object to page coordinates.

        The point is scaled by the first and fourth entries of ``ctm`` and
        offset by the first two entries of ``boundary``.  The remaining matrix
        entries and the boundary's width/height do not influence the result.

        :param x: x coordinate; anything ``float()`` accepts
        :param y: y coordinate; anything ``float()`` accepts
        :param ctm: sequence of exactly six numbers
        :param boundary: sequence of exactly four numbers
        :return: tuple ``(x, y)`` of the transformed point
        """
        x, y = float(x), float(y)
        # Unpacking keeps the original length checks (6 and 4 entries).
        scale_x, _, _, scale_y, _, _ = ctm
        left, bottom, _, _ = boundary
        return left + x * scale_x, bottom + y * scale_y
			
 
				+
			
 
    def draw_pdf(self):
        """Render ``self.data`` into ``self.pdf_io`` as a PDF.

        For every document in ``self.data`` the pages listed under
        ``"page_info"`` are drawn in ascending numeric order of their ids:
        images first, then text, then lines, one canvas page per entry.
        A document is skipped when its number of pages does not match its
        number of page sizes, or when its page ids cannot be sorted as
        integers.

        Signature drawing and registration of embedded fonts are currently
        not performed by this method.

        :return: dict mapping the page index (position within its document)
            to ``True`` when that page has both images and annotation text,
            else ``False``.
            NOTE(review): indices restart for each document, so with several
            documents later entries overwrite earlier ones - confirm intended.
        """
        c = canvas.Canvas(self.pdf_io)
        c.setAuthor(self.author)
        page_need_to_image_dict = {}
        for doc in self.data:
            fonts = doc.get("fonts")
            images = doc.get("images")
            page_size_details = doc.get("page_size")
            page_info = doc.get("page_info")

            # The page sizes are matched to pages by position, so the counts
            # have to agree.
            if len(page_info) != len(page_size_details):
                print('len(doc.get("page_info")) != len(page_size_details)')
                continue

            page_id_list = list(page_info.keys())
            try:
                page_id_list.sort(key=int)
            except (TypeError, ValueError):
                # A page id that is not an integer: give up on this document.
                traceback.print_exc()
                print('sort page_id_list error!', page_id_list)
                continue

            for pi, page_id in enumerate(page_id_list):
                page = page_info.get(page_id)
                annot_text_list = page.get('annot_text_list')
                page_size = page_size_details[pi]
                text_list = page.get("text_list")
                img_list = page.get("img_list")
                line_list = page.get("line_list")

                c.setPageSize((page_size[2] * self.OP, page_size[3] * self.OP))
                pdf_page_size = [x * self.OP for x in page_size]

                # Pages with both images and annotation text are flagged for
                # the caller.
                page_need_to_image_dict[pi] = bool(img_list and annot_text_list)

                # Images
                if img_list:
                    self.draw_img(c, img_list, images, page_size, pdf_page_size, self.OP)

                # Text
                if text_list:
                    # Replace special Chinese glyphs by their basic form.
                    for line_dict in text_list:
                        line_dict['text'] = special_font_to_normal(line_dict.get("text"))
                    self.draw_chars(c, text_list, fonts, page_size, pdf_page_size)

                # Lines
                if line_list:
                    self.draw_line(c, line_list, page_size, pdf_page_size)

                c.showPage()
        c.save()
        return page_need_to_image_dict
			
 
				+
			
 
    def __call__(self):
        """Render the document and return the PDF as bytes.

        On success ``self.page_need_to_image_dict`` holds the result of
        ``draw_pdf``.  If rendering fails for any reason the error is logged,
        an empty PDF is generated via ``gen_empty_pdf`` and
        ``self.page_need_to_image_dict`` is reset to an empty dict.

        :return: the content of ``self.pdf_io``
        """
        try:
            self.page_need_to_image_dict = self.draw_pdf()
            return self.pdf_io.getvalue()
        except Exception as err:
            logger.error(f"{err}")
            logger.error("ofd解析失败")
            traceback.print_exc()
            # Fall back to an empty document so callers still get valid bytes.
            self.gen_empty_pdf()
            self.page_need_to_image_dict = {}
            return self.pdf_io.getvalue()
			
 
				+
			
 
				+
			
 
				+
			
--- a/format_convert/easyofd/easyofd/draw/find_seal_img.py
+++ b/format_convert/easyofd/easyofd/draw/find_seal_img.py
@@ -0,0 +1,113 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME: easyofd read_seal_img
			
 
				+# CREATE_TIME: 2024/5/28 14:13
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: renoyuan
			
 
				+# note: 根据 ASN.1 解析签章 拿到 签章图片
			
 
				+import io
			
 
				+import base64
			
 
				+
			
 
				+from PIL import Image, UnidentifiedImageError
			
 
				+from loguru import logger
			
 
				+from pyasn1.codec.der.decoder import decode
			
 
				+from pyasn1.type import univ
			
 
				+from pyasn1.error import PyAsn1Error
			
 
				+
			
 
				+
			
 
				+
			
 
				+class SealExtract(object):
			
 
				+    def __init__(self,):
			
 
				+        pass
			
 
				+    def read_signed_value(self, path="", b64=""):
			
 
				+        # 读取二进制文件
			
 
				+        if b64:
			
 
				+            binary_data = base64.b64decode(b64)
			
 
				+        elif path:
			
 
				+            # print("seal_path",path)
			
 
				+            with open(path, 'rb') as file:
			
 
				+                binary_data = file.read()
			
 
				+        else:
			
 
				+            return
			
 
				+        # 尝试解码为通用的 ASN.1 结构
			
 
				+        try:
			
 
				+            decoded_data, _ = decode(binary_data)
			
 
				+        except (PyAsn1Error,) as e:
			
 
				+            logger.warning(f"Decoding failed: {e}")
			
 
				+            decoded_data = None
			
 
				+        except (AttributeError,) as e:
			
 
				+            logger.warning(f"AttributeError failed: {e}")
			
 
				+            decoded_data = None
			
 
				+        finally:
			
 
				+           return  decoded_data
			
 
				+
			
 
				+
			
 
				+    def find_octet_strings(self, asn1_data,octet_strings:list):
			
 
				+
			
 
				+        # 递归查找所有的 OctetString 实例
			
 
				+
			
 
				+        if isinstance(asn1_data, univ.OctetString):
			
 
				+
			
 
				+            octet_strings.append(asn1_data)
			
 
				+        elif isinstance(asn1_data, univ.Sequence) or isinstance(asn1_data, univ.Set):
			
 
				+            for component in asn1_data:
			
 
				+                self.find_octet_strings(asn1_data[f"{component}"], octet_strings)
			
 
				+        elif isinstance(asn1_data, univ.Choice):
			
 
				+            self.find_octet_strings(asn1_data.getComponent(), octet_strings)
			
 
				+        elif isinstance(asn1_data, univ.Any):
			
 
				+            try:
			
 
				+                sub_data, _ = decode(asn1_data.asOctets())
			
 
				+                self.find_octet_strings(sub_data, octet_strings)
			
 
				+            except PyAsn1Error:
			
 
				+                pass
			
 
				+
			
 
				+
			
 
				+    def hex_to_image(self, hex_data, image_format='PNG',inx=0):
			
 
				+        """
			
 
				+        将16进制数据转换为图片并保存。
			
 
				+
			
 
				+        :param hex_data: 图片的16进制数据字符串
			
 
				+        :param image_format: 图片的格式，默认为'PNG'
			
 
				+        """
			
 
				+        # 将16进制数据转换为二进制数据
			
 
				+
			
 
				+        binary_data = bytes.fromhex(hex_data)
			
 
				+
			
 
				+        # 创建BytesIO对象以读取二进制数据
			
 
				+        image_stream = io.BytesIO(binary_data)
			
 
				+
			
 
				+        # 使用Pillow打开图像数据并保存
			
 
				+        try:
			
 
				+            image = Image.open(image_stream)
			
 
				+            # image.save(f'{inx}_image.{image_format}', format=image_format)
			
 
				+            # print(f"图片已保存为'image.{image_format}'")
			
 
				+            return image
			
 
				+        except UnidentifiedImageError:
			
 
				+            # logger.info("not img ")
			
 
				+            pass
			
 
				+
			
 
				+    def __call__(self, path="", b64=""):
			
 
				+
			
 
				+        decoded_data = self.read_signed_value(path=path, b64=b64)
			
 
				+        octet_strings = []
			
 
				+        img_list = []  # 目前是只有一个的，若存在多个的话关联后面考虑
			
 
				+        if decoded_data:
			
 
				+            self.find_octet_strings(decoded_data, octet_strings)
			
 
				+
			
 
				+            for i, octet_string in enumerate(octet_strings):
			
 
				+                # logger.info(f"octet_string{octet_string}")
			
 
				+                if str(octet_string.prettyPrint()).startswith("0x"):
			
 
				+
			
 
				+                    img = self.hex_to_image(str(octet_string.prettyPrint())[2:],inx= i)
			
 
				+                    if img:
			
 
				+                        # logger.info("ASN.1 data found.")
			
 
				+                        img_list.append(img)
			
 
				+        else:
			
 
				+            pass
			
 
				+            # logger.info("No valid ASN.1 data found.")
			
 
				+
			
 
				+        return  img_list
			
 
				+
			
 
				+if __name__=="__main__":
			
 
				+    print(SealExtract()(r"F:\code\easyofd\test\1111_xml\Doc_0\Signs\Sign_0\SignedValue.dat" ))
			
 
				+
			
--- a/format_convert/easyofd/easyofd/draw/font_tools.py
+++ b/format_convert/easyofd/easyofd/draw/font_tools.py
@@ -0,0 +1,216 @@
 
				+#!/usr/bin/env python
			
 
				+#-*- coding: utf-8 -*-
			
 
				+#PROJECT_NAME: D:\code\easyofd\easyofd
			
 
				+#CREATE_TIME: 2023-07-27 
			
 
				+#E_MAIL: renoyuan@foxmail.com
			
 
				+#AUTHOR: reno 
			
 
				+#NOTE: 字体处理
			
 
				+import sys
			
 
				+import time
			
 
				+import re
			
 
				+import json
			
 
				+import base64
			
 
				+import zipfile
			
 
				+import os
			
 
				+import shutil
			
 
				+import logging
			
 
				+from io import BytesIO, StringIO
			
 
				+import string
			
 
				+from uuid import uuid1
			
 
				+import random
			
 
				+import traceback
			
 
				+import logging
			
 
				+
			
 
				+
			
 
				+import tempfile
			
 
				+import xmltodict
			
 
				+from fontTools.ttLib import TTFont as ttLib_TTFont
			
 
				+from fontTools.pens.basePen import BasePen
			
 
				+from reportlab.graphics.shapes import Path
			
 
				+from reportlab.lib import colors
			
 
				+from reportlab.graphics import renderPM
			
 
				+from reportlab.graphics.shapes import Group, Drawing, scale
			
 
				+from reportlab import platypus
			
 
				+from reportlab.lib.pagesizes import letter, A4
			
 
				+from reportlab.lib.units import mm,inch
			
 
				+from reportlab.platypus import SimpleDocTemplate, Image
			
 
				+from reportlab.lib.utils import ImageReader
			
 
				+from reportlab.pdfgen import canvas
			
 
				+from reportlab.pdfbase import pdfmetrics
			
 
				+from reportlab.pdfbase.cidfonts import UnicodeCIDFont
			
 
				+from reportlab.pdfbase.ttfonts import TTFont
			
 
				+from concurrent.futures import ThreadPoolExecutor
			
 
				+import threading
			
 
				+import multiprocessing
			
 
				+import PIL
			
 
				+
			
 
				+
			
 
				+from reportlab.lib.fonts import _tt2ps_map 
			
 
				+from reportlab.lib.fonts import _family_alias
			
 
				+
			
 
				+
			
 
				+sys.path.append(os.path.dirname(__file__) + "/../../../../")
			
 
				+
			
 
				+from format_convert.easyofd.easyofd.draw import FONTS
			
 
				+
			
 
				+from loguru import logger
			
 
				+
			
 
				+
			
 
				+
			
 
				+class FontTool(object):
			
 
				+    FONTS = FONTS
			
 
				+    def __init__(self):
			
 
				+        # 初始支持字体
			
 
				+        # 字体检测
			
 
				+        # logger.debug("FontTool init ,read system default Font ... ")
			
 
				+        self.FONTS = self.get_installed_fonts()
			
 
				+        # logger.debug(f"system default Font is \n{self.FONTS} \n{'-'*50}")
			
 
				+
			
 
				+
			
 
				+    def get_system_font_dirs(self,):
			
 
				+        """获取不同操作系统的字体目录"""
			
 
				+        system = os.name
			
 
				+        if system == 'nt':  # Windows
			
 
				+            return [os.path.join(os.environ['WINDIR'], 'Fonts')]
			
 
				+        elif system == 'posix':  # Linux/macOS
			
 
				+            return [
			
 
				+                '/usr/share/fonts',
			
 
				+                '/usr/local/share/fonts',
			
 
				+                os.path.expanduser('~/.fonts'),
			
 
				+                os.path.expanduser('~/.local/share/fonts'),
			
 
				+                '/Library/Fonts',  # macOS
			
 
				+                '/System/Library/Fonts'  # macOS
			
 
				+            ]
			
 
				+        else:
			
 
				+            return []
			
 
				+
			
 
				+    def normalize_font_name(self, font_name):
			
 
				+        """将字体名称规范化，例如 'Times New Roman Bold' -> 'TimesNewRoman-Bold'"""
			
 
				+        # 替换空格为无，并将样式（Bold/Italic等）用连字符连接
			
 
				+        normalized = font_name.replace(' ', '')
			
 
				+        # 处理常见的样式后缀
			
 
				+        for style in ['Bold', 'Italic', 'Regular', 'Light', 'Medium', ]:
			
 
				+            if style in normalized:
			
 
				+                normalized = normalized.replace(style, f'-{style}')
			
 
				+
			
 
				+        # todo 特殊字体名规范 后续存在需要完善
			
 
				+        if normalized ==  "TimesNewRoman" :
			
 
				+            normalized = normalized.replace("TimesNewRoman","Times-Roman")
			
 
				+        return normalized
			
 
				+
			
 
				+    def _process_ttc_font(self, ttc_font):
			
 
				+        """处理ttc文件中的所有字体"""
			
 
				+        def judge_name(name):
			
 
				+            if 'http://' in name or 'https://' in name or len(name) > 50:
			
 
				+                return False
			
 
				+            else:
			
 
				+                return True
			
 
				+        font_names = set()
			
 
				+        try:
			
 
				+            # 获取所有可用的名称记录
			
 
				+            name_records = ttc_font['name'].names
			
 
				+
			
 
				+            for idx, record in enumerate(name_records):
			
 
				+                try:
			
 
				+                    # 尝试获取中文名称（简体中文的language ID是2052）
			
 
				+                    if record.platformID == 3 and record.langID == 2052:
			
 
				+                        cn_name = record.toUnicode()
			
 
				+                        if judge_name(cn_name):
			
 
				+                            font_names.add(cn_name)
			
 
				+
			
 
				+
			
 
				+
			
 
				+                    # 回退到英文名称（language ID 1033）
			
 
				+                    elif record.platformID == 3 and record.langID == 1033:
			
 
				+                        name = record.toUnicode()
			
 
				+                        if judge_name(name):
			
 
				+                            font_names.add(name)
			
 
				+                except:
			
 
				+                    continue
			
 
				+        except KeyError:
			
 
				+            # 如果name表不存在，跳过
			
 
				+            pass
			
 
				+        return font_names
			
 
				+    def get_installed_fonts(self, ):
			
 
				+        """获取所有已安装字体的名称和家族"""
			
 
				+        font_dirs = self.get_system_font_dirs()
			
 
				+        installed_fonts = set()
			
 
				+        for font_dir in font_dirs:
			
 
				+            if not os.path.isdir(font_dir):
			
 
				+                continue
			
 
				+            for root, _, files in os.walk(font_dir):
			
 
				+                for file in files:
			
 
				+                    if file.lower().endswith(('.ttf', '.otf','.ttc')):
			
 
				+                        font_path = os.path.join(root, file)
			
 
				+
			
 
				+                        try:
			
 
				+                            if file.lower().endswith('.ttc'):
			
 
				+                                # 对于ttc文件，读取所有字体
			
 
				+                                ttc_font = ttLib_TTFont(font_path, fontNumber=0)  # 读取第一个字体
			
 
				+                                installed_fonts.update(self._process_ttc_font(ttc_font))
			
 
				+                            else:
			
 
				+                                with ttLib_TTFont(font_path) as font:
			
 
				+                                    # 提取字体全名和家族名
			
 
				+                                    name_cn = font['name'].getName(4, 3, 1, 2052)
			
 
				+                                    if name_cn:
			
 
				+                                        installed_fonts.add(name_cn.toUnicode())
			
 
				+                                    # 4=Full Name, 3=Windows, 1=Unicode
			
 
				+                                    name = font['name'].getName(4, 3, 1, 1033)
			
 
				+                                    if name:
			
 
				+                                        installed_fonts.add(name.toUnicode())
			
 
				+                                    family_cn = font['name'].getName(1, 3, 1, 2052)
			
 
				+                                    if family_cn:
			
 
				+                                        installed_fonts.add(family_cn.toUnicode())
			
 
				+                                    family = font['name'].getName(1, 3, 1, 1033)
			
 
				+                                    if family:  # 1=Family Name
			
 
				+                                        installed_fonts.add(family.toUnicode())
			
 
				+                        except Exception as e:
			
 
				+                            print(f"解析字体 {font_path} 失败: {e}")
			
 
				+        installed_fonts = list(installed_fonts)
			
 
				+        if "宋体" in installed_fonts:
			
 
				+            installed_fonts.remove("宋体")
			
 
				+            installed_fonts.insert(0, "宋体")
			
 
				+        return installed_fonts
			
 
				+
			
 
				+    def is_font_available(self, target_font):
			
 
				+        """检查目标字体是否安装"""
			
 
				+        installed_fonts = self.get_installed_fonts()
			
 
				+        return target_font in installed_fonts
			
 
				+
			
 
				+    
			
 
				+    def font_check(self):
			
 
				+        pass
			
 
				+        # logger.info("f{_tt2ps_map}")
			
 
				+        # logger.info("f{_family_alias}")
			
 
				+        
			
 
				+        # for font in self.FONTS:
			
 
				+        #     if font in _tt2ps_map.values():
			
 
				+        #         logger.info(f"已注册{font}")
			
 
				+        #     else:
			
 
				+        #         logger.warning(f"-{font}-未注册可能导致写入失败")
			
 
				+                 
			
 
				+        
			
 
				+        
			
 
				+    def register_font(self,file_name,FontName,font_b64):
			
 
				+        
			
 
				+        if font_b64:
			
 
				+            
			
 
				+            file_name = os.path.split(file_name)
			
 
				+            # logger.error(f"file_name:{file_name}")
			
 
				+            # logger.info(f"file_name:{file_name}")
			
 
				+            if isinstance(file_name, (tuple, list)):
			
 
				+                    file_name = file_name[1]
			
 
				+            if not FontName:
			
 
				+                FontName = file_name.split(".")[0]
			
 
				+
			
 
				+            try:
			
 
				+                with open(file_name, "wb") as f:
			
 
				+                    f.write(base64.b64decode(font_b64))
			
 
				+                # print("FontName", FontName, "file_name", file_name)
			
 
				+                pdfmetrics.registerFont(TTFont(FontName, file_name))
			
 
				+                self.FONTS.append(FontName)
			
 
				+            except Exception as e:
			
 
				+                logger.error(f"register_font_error:\n{e} \n 包含不支持解析字体格式")
			
 
				+            finally:
			
 
				+                if os.path.exists(file_name):
			
 
				+                    os.remove(file_name)
			
--- a/format_convert/easyofd/easyofd/draw/ofdtemplate.py
+++ b/format_convert/easyofd/easyofd/draw/ofdtemplate.py
@@ -0,0 +1,666 @@
 
				+#!/usr/bin/env python
			
 
				+#-*- coding: utf-8 -*-
			
 
				+#PROJECT_NAME: F:\code\easyofd\easyofd\draw
			
 
				+#CREATE_TIME: 2023-10-30 
			
 
				+#E_MAIL: renoyuan@foxmail.com
			
 
				+#AUTHOR: reno 
			
 
				+#note:  ofd 基础结构模板
			
 
				+import tempfile
			
 
				+import os
			
 
				+import abc
			
 
				+import copy
			
 
				+
			
 
				+from loguru import logger
			
 
				+import xmltodict
			
 
				+import zipfile
			
 
				+
			
 
				+__all__ = ["CurId", "OFDTemplate", "DocumentTemplate", "DocumentResTemplate",
			
 
				+           "PublicResTemplate", "ContentTemplate", "OFDStructure"]
			
 
				+"""
			
 
				+OFD目录结构
			
 
				+    │  OFD.xml
			
 
				+    │  
			
 
				+    └─Doc_0
			
 
				+        │  Document.xml
			
 
				+        │  DocumentRes.xml
			
 
				+        │  PublicRes.xml
			
 
				+        │  
			
 
				+        ├─Annots
			
 
				+        │  │  Annotations.xml
			
 
				+        │  │  
			
 
				+        │  └─Page_0
			
 
				+        │          Annotation.xml
			
 
				+        │          
			
 
				+        ├─Attachs
			
 
				+        │      Attachments.xml
			
 
				+        │      original_invoice.xml
			
 
				+        │      
			
 
				+        ├─Pages
			
 
				+        │  └─Page_0
			
 
				+        │          Content.xml
			
 
				+        │          
			
 
				+        ├─Res
			
 
				+        │      image_80.jb2
			
 
				+        │      
			
 
				+        ├─Signs
			
 
				+        │  │  Signatures.xml
			
 
				+        │  │  
			
 
				+        │  └─Sign_0
			
 
				+        │          Signature.xml
			
 
				+        │          SignedValue.dat
			
 
				+        │          
			
 
				+        ├─Tags
			
 
				+        │      CustomTag.xml
			
 
				+        │      CustomTags.xml
			
 
				+        │      
			
 
				+        └─Tpls
			
 
				+            └─Tpl_0
			
 
				+                    Content.xml
			
 
				+"""
			
 
				+class CurId(object):
			
 
				+    """文档内id控制对象"""
			
 
				+    def __init__(self):
			
 
				+        self.id = 1
			
 
				+        self.used = False
			
 
				+        self.uuid_map = {} # 资源文件生成id的时候手动添加进来后面构建page 可以 匹配ResourceID
			
 
				+
			
 
				+    def add_uuid_map(self, k, v):
			
 
				+        # logger.debug(f"uuid_map add {k}: {v}")
			
 
				+        self.uuid_map[k] = v
			
 
				+    def add(self):
			
 
				+        self.id += 1
			
 
				+
			
 
				+    def get_id(self):
			
 
				+        if self.used:
			
 
				+            self.add()
			
 
				+            return self.id
			
 
				+        if not self.used:
			
 
				+            cur_id = self.id
			
 
				+            self.used =True
			
 
				+            return cur_id
			
 
				+
			
 
				+    def get_max_id(self):
			
 
				+        MaxUnitID = self.id + 1
			
 
				+        return MaxUnitID
			
 
				+
			
 
				+class TemplateBase(object):
			
 
				+    """模板基类"""
			
 
				+    key_map = {}  # 变量名对应 xml 中形式 映射 如 传入   DocID -> ofd:DocID
			
 
				+    id_keys = [ ]  # 对需要的要素添加 "@ID"
			
 
				+    template_name = ""
			
 
				+    def __init__(self,*args,**kwargs):
			
 
				+        # print(args)
			
 
				+        # print(kwargs)
			
 
				+        self.id_obj: CurId = kwargs.get("id_obj")
			
 
				+        # print("id_obj", self.id_obj)
			
 
				+        self.assemble(*args, **kwargs)
			
 
				+
			
 
				+
			
 
				+    def assemble(self,*args, **kwargs):
			
 
				+        """对ofdjson组装"""
			
 
				+
			
 
				+        self.final_json = copy.deepcopy(self.ofdjson)
			
 
				+
			
 
				+        # 往模板里面添加要素值
			
 
				+        if kwargs:
			
 
				+            for k, v in kwargs.items():
			
 
				+                if k in self.key_map:
			
 
				+                    self.modify(self.final_json, self.key_map[k], v)
			
 
				+
			
 
				+        # 添加id
			
 
				+        for id_key in self.id_keys:
			
 
				+            print(f"开始gen_id >> {self.template_name}>>{id_key}")
			
 
				+            # print(f"final_json {self.final_json}")
			
 
				+            self.gen_id(self.final_json, id_key)
			
 
				+
			
 
				+    def gen_id(self,ofdjson, id_key):
			
 
				+        """生成id"""
			
 
				+        # print("id_key ", id_key, "ofdjson ", ofdjson)
			
 
				+
			
 
				+        for k, v in ofdjson.items():
			
 
				+            if k == id_key:
			
 
				+                # 添加id
			
 
				+                if isinstance(ofdjson[k], dict):
			
 
				+                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
			
 
				+
			
 
				+                    # logger.info(f"添加id -> {ofdjson[k]}")
			
 
				+                elif isinstance(ofdjson[k], list):
			
 
				+                    for i in ofdjson[k]:
			
 
				+                        i["@ID"] = f"{self.id_obj.get_id()}"
			
 
				+
			
 
				+                        # logger.info(f"添加id ->i {i}")
			
 
				+
			
 
				+            elif isinstance(v, dict):
			
 
				+                # logger.debug(f"dict_v{v}")
			
 
				+                self.gen_id(v, id_key)
			
 
				+
			
 
				+
			
 
				+            elif isinstance(v, list):
			
 
				+                for v_cell in v:
			
 
				+                    if isinstance(v_cell, dict):
			
 
				+                        # logger.debug(f"dict_v{v}")
			
 
				+                        self.gen_id(v_cell, id_key)
			
 
				+
			
 
				+                    
			
 
				+    def modify(self, ofdjson, key, value):
			
 
				+        """对指定key的值更改  多个会统一改"""
			
 
				+        
			
 
				+        for k, v in ofdjson.items():
			
 
				+            if k == key:
			
 
				+                ofdjson[k] = value
			
 
				+            elif isinstance(v, dict):
			
 
				+                self.modify(v, key, value)
			
 
				+            elif isinstance(v, list):
			
 
				+                for v_cell in v:
			
 
				+                    if isinstance(v_cell, dict):
			
 
				+                        self.modify(v_cell, key, value)
			
 
				+    
			
 
				+    def save(self, path):
			
 
				+        xml_data = xmltodict.unparse(self.final_json, pretty=True)
			
 
				+        with open(path, "w", encoding="utf-8") as f:
			
 
				+            f.write(xml_data)
			
 
				+
			
 
				+class OFDTemplate(TemplateBase):
			
 
				+    """根节点全局唯一 OFD.xml"""
			
 
				+    template_name = "OFD"
			
 
				+    key_map = {"Author": "ofd:Author", "DocID": "ofd:DocID"  ,"CreationDate": "ofd:CreationDate"
			
 
				+    }
			
 
				+
			
 
				+    ofdjson = {
			
 
				+
			
 
				+        "ofd:OFD": {
			
 
				+            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
			
 
				+            "@Version": "1.1",
			
 
				+            "@DocType": "OFD",
			
 
				+            "ofd:DocBody": [{
			
 
				+                "ofd:DocInfo": {
			
 
				+                    "ofd:DocID": "0C1D4F7159954EEEDE517F7285E84DC4",
			
 
				+                    "ofd:Creator": "easyofd",
			
 
				+                    "ofd:author": "renoyuan",
			
 
				+                    "ofd:authoremail": "renoyuan@foxmail.com",
			
 
				+                    "ofd:CreatorVersion": "1.0",
			
 
				+                    "ofd:CreationDate": "2023-10-27"
			
 
				+                },
			
 
				+                "ofd:DocRoot": "Doc_0/Document.xml"
			
 
				+            }]
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+class DocumentTemplate(TemplateBase):
			
 
				+    """DOC 内唯一 表示DOC内部结构 Document.xml
			
 
				+
			
 
				+    """
			
 
				+    template_name = "Document"
			
 
				+    key_map = {"Page": "ofd:Page","PhysicalBox":"ofd:PhysicalBox"}
			
 
				+    id_keys = ["ofd:Page"]
			
 
				+    ofdjson ={
			
 
				+    "ofd:Document": {
			
 
				+        "@xmlns:ofd": "http://blog.yuanhaiying.cn",
			
 
				+        "ofd:CommonData": {
			
 
				+            "ofd:MaxUnitID": 0,
			
 
				+            "ofd:PageArea": {
			
 
				+                "ofd:PhysicalBox": "0 0 140 90"
			
 
				+            },
			
 
				+            "ofd:PublicRes": "PublicRes.xml",
			
 
				+            "ofd:DocumentRes": "DocumentRes.xml"
			
 
				+        },
			
 
				+        "ofd:Pages":
			
 
				+            {
			
 
				+            "ofd:Page": [{
			
 
				+                "@ID": 0,
			
 
				+                "@BaseLoc": "Pages/Page_0/Content.xml"
			
 
				+            }]
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+    def update_max_unit_id(self, final_json=None):
			
 
				+        if not final_json:
			
 
				+            final_json = self.final_json
			
 
				+
			
 
				+        for k, v in final_json.items():
			
 
				+            if k == "ofd:MaxUnitID":
			
 
				+                final_json["ofd:MaxUnitID"]=self.id_obj.get_max_id()
			
 
				+                return
			
 
				+
			
 
				+            elif isinstance(v, dict):
			
 
				+                self.update_max_unit_id(v)
			
 
				+            elif isinstance(v, list):
			
 
				+                for v_cell in v:
			
 
				+                    if isinstance(v_cell, dict):
			
 
				+                        self.update_max_unit_id(v_cell)
			
 
				+
			
 
				+    def update_page(self,page_num):
			
 
				+        pass
			
 
				+
			
 
				+class DocumentResTemplate(TemplateBase):
			
 
				+    """DOC 内唯一 表示MultyMedia 资源信息 如 图片 DocumentRes.xml """
			
 
				+    template_name = "DocumentRes"
			
 
				+    key_map = {"MultiMedia": "ofd:MultiMedia"}
			
 
				+    id_keys = ["ofd:DrawParam", "ofd:MultiMedia"]
			
 
				+    ofdjson = {
			
 
				+        "ofd:Res": {
			
 
				+            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
			
 
				+            "@BaseLoc": "Res",
			
 
				+            "ofd:MultiMedias": {
			
 
				+                "ofd:MultiMedia": [
			
 
				+                    {
			
 
				+                        "@ID": 0,
			
 
				+                        "@Type": "Image",
			
 
				+                        "ofd:MediaFile": "Image_2.jpg"
			
 
				+                    }
			
 
				+                ]
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+    def gen_id(self,ofdjson, id_key):
			
 
				+        """生成id"""
			
 
				+        # print("id_key ", id_key, "ofdjson ", ofdjson)
			
 
				+
			
 
				+        for k, v in ofdjson.items():
			
 
				+            if k == id_key:
			
 
				+                # 添加id
			
 
				+                if isinstance(ofdjson[k], dict):
			
 
				+                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
			
 
				+
			
 
				+                    res_uuid = ofdjson[k].get("res_uuid")
			
 
				+                    if res_uuid:
			
 
				+                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
			
 
				+                    # logger.info(f"添加id -> {ofdjson[k]}")
			
 
				+                elif isinstance(ofdjson[k], list):
			
 
				+                    for i in ofdjson[k]:
			
 
				+
			
 
				+                        i["@ID"] = f"{self.id_obj.get_id()}"
			
 
				+                        res_uuid = i.get("res_uuid")
			
 
				+                        if res_uuid:
			
 
				+                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])
			
 
				+                        # logger.info(f"添加id ->i {i}")
			
 
				+
			
 
				+            elif isinstance(v, dict):
			
 
				+                # logger.debug(f"dict_v{v}")
			
 
				+                self.gen_id(v, id_key)
			
 
				+
			
 
				+
			
 
				+            elif isinstance(v, list):
			
 
				+                for v_cell in v:
			
 
				+                    if isinstance(v_cell, dict):
			
 
				+                        # logger.debug(f"dict_v{v}")
			
 
				+                        self.gen_id(v_cell, id_key)
			
 
				+
			
 
				+class PublicResTemplate(TemplateBase):
			
 
				+    """DOC 内唯一 公共配置资源信息 如 Font  Color 等 PublicRes.xml"""
			
 
				+    template_name = "PulicRes"
			
 
				+    key_map = {"Font": "ofd:Font"}
			
 
				+    id_keys = ["ofd:ColorSpace", "ofd:Font"]
			
 
				+    ofdjson = {
			
 
				+        "ofd:Res": {
			
 
				+            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
			
 
				+            "@BaseLoc": "Res",
			
 
				+            "ofd:ColorSpaces": {
			
 
				+                "ofd:ColorSpace": {
			
 
				+                    "@ID": 0,
			
 
				+                    "@Type": "RGB",
			
 
				+                    "@BitsPerComponent": "8",
			
 
				+                    "#text":""
			
 
				+                }
			
 
				+            },
			
 
				+            "ofd:Fonts": {
			
 
				+                "ofd:Font": [
			
 
				+                {
			
 
				+                    "@ID": 0,
			
 
				+                    "@FontName": "宋体",
			
 
				+                    "@FamilyName": "宋体",
			
 
				+
			
 
				+                }
			
 
				+            ]
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+    def gen_id(self,ofdjson, id_key):
			
 
				+        """生成id"""
			
 
				+        # print("id_key ", id_key, "ofdjson ", ofdjson)
			
 
				+
			
 
				+        for k, v in ofdjson.items():
			
 
				+            if k == id_key:
			
 
				+                # 添加id
			
 
				+                if isinstance(ofdjson[k], dict):
			
 
				+                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
			
 
				+                    res_uuid = ofdjson[k].get("res_uuid")
			
 
				+                    if res_uuid:
			
 
				+                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
			
 
				+                    # logger.info(f"添加id -> {ofdjson[k]}")
			
 
				+                elif isinstance(ofdjson[k], list):
			
 
				+                    for i in ofdjson[k]:
			
 
				+
			
 
				+                        i["@ID"] = f"{self.id_obj.get_id()}"
			
 
				+                        res_uuid = i.get("res_uuid")
			
 
				+                        if res_uuid:
			
 
				+                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])
			
 
				+                        # logger.info(f"添加id ->i {i}")
			
 
				+
			
 
				+            elif isinstance(v, dict):
			
 
				+                # logger.debug(f"dict_v{v}")
			
 
				+                self.gen_id(v, id_key)
			
 
				+
			
 
				+
			
 
				+            elif isinstance(v, list):
			
 
				+                for v_cell in v:
			
 
				+                    if isinstance(v_cell, dict):
			
 
				+                        # logger.debug(f"dict_v{v}")
			
 
				+                        self.gen_id(v_cell, id_key)
			
 
				+
			
 
				+'''
			
 
				+    "ofd:Font": [
			
 
				+
			
 
				+    {
			
 
				+        "@ID": 0,
			
 
				+        "@FontName": "STSong",
			
 
				+        "@FamilyName": "SimSun",
			
 
				+        "@Serif": "true",
			
 
				+        "@FixedWidth": "true",
			
 
				+        "@Charset": "prc"
			
 
				+    }
			
 
				+            "ofd:Area": {
			
 
				+            "ofd:PhysicalBox": "0 0 210 140"
			
 
				+        },
			
 
				+'''
			
 
				+
			
 
				+
			
 
				+class ContentTemplate(TemplateBase):
			
 
				+    """正文部分 Content.xml"""
			
 
				+    #"@Type": "Body",
			
 
				+    template_name = "Content"
			
 
				+    key_map = {"ImageObject": "ofd:ImageObject",
			
 
				+               "PathObject": "ofd:PathObject",
			
 
				+               "TextObject": "ofd:TextObject",
			
 
				+               "CGTransform": "ofd:CGTransform",
			
 
				+               "PhysicalBox": "ofd:PhysicalBox",
			
 
				+               }
			
 
				+    id_keys = ["ofd:Layer", "ofd:TextObject", "ofd:PathObject", "ofd:Clips", "ofd:ImageObject"]
			
 
				+    correlate_map = {"ofd:TextObject": "@Font",
			
 
				+                     "ofd:ImageObject": "@ResourceID"
			
 
				+
			
 
				+                     }
			
 
				+
			
 
				+    ofdjson = {
			
 
				+    "ofd:Page": {
			
 
				+        "@xmlns:ofd": "http://blog.yuanhaiying.cn",
			
 
				+
			
 
				+        "ofd:Content": {
			
 
				+            "ofd:PageArea": {
			
 
				+                "ofd:PhysicalBox": "0 0 210 140"
			
 
				+            },
			
 
				+            "ofd:Layer":  {
			
 
				+                "@ID": 0,
			
 
				+                "@Type": "Foreground",
			
 
				+
			
 
				+
			
 
				+                "ofd:TextObject": [{
			
 
				+                        "@ID": 0,
			
 
				+                        "@CTM": "7.054 0 0 7.054 0 134.026",
			
 
				+                        "@Boundary": "69 7 72 7.6749",
			
 
				+                        "@Font": "69",
			
 
				+                        "@Size": "6.7028",
			
 
				+                        "ofd:FillColor": {
			
 
				+                            "@ColorSpace": "4",
			
 
				+                            "@Value": "156 82 35"
			
 
				+                        },
			
 
				+                        "ofd:CGTransform": {
			
 
				+                            "@CodePosition": "0",
			
 
				+                            "@CodeCount": "10",
			
 
				+                            "@GlyphCount": "10",
			
 
				+                            "ofd:Glyphs": "18 10 11 42 60 53 24 11 42 61"
			
 
				+                        },
			
 
				+                        "ofd:TextCode": {
			
 
				+                            "@X": "13.925",
			
 
				+                            "@Y": "10",
			
 
				+                            "@DeltaX": "7 7 7 7 7 7 7 7 7",
			
 
				+                            "#text": "电⼦发票（普通发票）"
			
 
				+                        }
			
 
				+                    }],
			
 
				+                "ofd:ImageObject": []
			
 
				+                }
			
 
				+        }}}
			
 
				+    def __init__(self,*args,**kwargs):
			
 
				+        # print(args)
			
 
				+        # print(kwargs)
			
 
				+        super().__init__(*args, **kwargs)
			
 
				+        # 关联res_uuid
			
 
				+        for key, targe_key in self.correlate_map.items():
			
 
				+            self.correlate_res_uuid(self.final_json,key,targe_key)
			
 
				+
			
 
				+    def correlate_res_uuid(self, ofdjson,key,targe_key):
			
 
				+        """correlate_res_uuid"""
			
 
				+        print("========uuid_map", self.id_obj.uuid_map)
			
 
				+        for k, v in ofdjson.items():
			
 
				+            if k == key:
			
 
				+                res_uuid = v_cell.pop("res_uuid", None)
			
 
				+                if isinstance(v, dict) and res_uuid:
			
 
				+
			
 
				+                    v[targe_key] = self.id_obj.uuid_map[res_uuid]
			
 
				+                    # logger.debug(f'{targe_key} >>> {v[targe_key]} -- {res_uuid}')
			
 
				+                elif isinstance(v, list):
			
 
				+                    for v_cell in v:
			
 
				+                        res_uuid = None
			
 
				+                        if isinstance(v_cell, dict):
			
 
				+                            res_uuid = v_cell.pop("res_uuid", None)
			
 
				+                        if isinstance(v_cell, dict) and res_uuid:
			
 
				+
			
 
				+                            v_cell[targe_key] = self.id_obj.uuid_map[res_uuid]
			
 
				+                            # logger.debug(f'{targe_key} >>> {v_cell[targe_key]} -- {res_uuid}')
			
 
				+                        else:
			
 
				+                            pass
			
 
				+                            # print(f"v_cell {v_cell}")
			
 
				+                    pass
			
 
				+                else:
			
 
				+                    pass
			
 
				+            elif isinstance(v, dict):
			
 
				+                self.correlate_res_uuid(v, key, targe_key)
			
 
				+            elif isinstance(v, list):
			
 
				+                for v_cell in v:
			
 
				+                    if isinstance(v_cell, dict):
			
 
				+                        self.correlate_res_uuid(v_cell, key, targe_key)
			
 
				+
			
 
				+
			
 
				+'''
			
 
				+                "ofd:PathObject": [{
			
 
				+                        "@ID": 0,
			
 
				+                        "@CTM": "0.3527 0 0 -0.3527 0.35 141.43001",
			
 
				+                        "@Boundary": "-0.35 -0.35 212.33 141.78999",
			
 
				+                        "@LineWidth": "1",
			
 
				+                        "@MiterLimit": "10",
			
 
				+                        "@Stroke": "false",
			
 
				+                        "@Fill": "true",
			
 
				+                        "ofd:FillColor": {
			
 
				+                            "@ColorSpace": "4",
			
 
				+                            "@Value": "255 255 255"
			
 
				+                        },
			
 
				+                        "ofd:StrokeColor": {
			
 
				+                            "@ColorSpace": "4",
			
 
				+                            "@Value": "0 0 0"
			
 
				+                        },
			
 
				+                        "ofd:Clips": {
			
 
				+                            "ofd:Clip": {
			
 
				+                                "ofd:Area": {
			
 
				+                                    "ofd:Path": {
			
 
				+                                        "@ID": 0,
			
 
				+                                        "@Boundary": "0.00766 -0.00763 600 400.00003",
			
 
				+                                        "@Stroke": "false",
			
 
				+                                        "@Fill": "true",
			
 
				+                                        "ofd:AbbreviatedData": "M 0 0 L 600 0 L 600 400.00003 L 0 400.00003 C"
			
 
				+                                    }
			
 
				+                                }
			
 
				+                            }
			
 
				+                        },
			
 
				+                        "ofd:AbbreviatedData": "M -1 401 L 601 401 L 601 -1 L -1 -1 C"
			
 
				+                    },],
			
 
				+                
			
 
				+"ofd:ImageObject": [{
			
 
				+                        "@ID": 0,
			
 
				+                        "@CTM": "19.7512 0 0 19.7512 0 0",
			
 
				+                        "@Boundary": "7.23035 7.40671 19.7512 19.7512",
			
 
				+                        "@ResourceID": "104"
			
 
				+                    }],
			
 
				+'''
			
 
				+
			
 
				+class OFDStructure(object):
			
 
				+    """OFD structure"""
			
 
				+    def __init__(self, name, ofd=None, document=None,
			
 
				+                 document_res=None, public_res=None,
			
 
				+                  content_res:list=[], res_static: dict={}):
			
 
				+        # 初始化的时候会先自动初始化 默认参数值
			
 
				+        id_obj = CurId()
			
 
				+        self.name = name
			
 
				+        self.ofd = ofd if ofd else OFDTemplate(id_obj=id_obj)
			
 
				+        self.document = document if document else DocumentTemplate(id_obj=id_obj)
			
 
				+        self.document_res = document_res if document_res else  DocumentResTemplate(id_obj=id_obj)
			
 
				+        self.public_res = public_res if public_res else PublicResTemplate(id_obj=id_obj)
			
 
				+        self.content_res = content_res if content_res else [ContentTemplate(id_obj=id_obj)]
			
 
				+        self.res_static = res_static
			
 
				+       
			
 
				+    def __call__(self, test=False):
			
 
				+        """写入文件生成ofd"""
			
 
				+        with tempfile.TemporaryDirectory() as t_dir:
			
 
				+            if test:
			
 
				+                temp_dir = r"./test"
			
 
				+                os.mkdir(temp_dir)
			
 
				+            else:
			
 
				+                temp_dir = t_dir
			
 
				+            # 创建过程目录
			
 
				+            temp_dir_doc_0 = os.path.join(temp_dir, 'Doc_0')
			
 
				+            temp_dir_pages = os.path.join(temp_dir, 'Doc_0', "Pages")
			
 
				+            temp_dir_res = os.path.join(temp_dir, 'Doc_0', "Res")  # 静态资源路径
			
 
				+            for i in [temp_dir_doc_0, temp_dir_pages, temp_dir_res]:
			
 
				+                # print(i)
			
 
				+                os.mkdir(i)
			
 
				+
			
 
				+            # 写入 OFD
			
 
				+            self.ofd.save(os.path.join(temp_dir, 'OFD.xml'))
			
 
				+
			
 
				+            # 更新 max_unit_id & 写入 Document
			
 
				+            self.document.update_max_unit_id()
			
 
				+            self.document.save(os.path.join(temp_dir_doc_0, 'Document.xml'))
			
 
				+
			
 
				+            # 写入 DocumentRes
			
 
				+            self.document_res.save(os.path.join(temp_dir_doc_0, 'DocumentRes.xml'))
			
 
				+
			
 
				+            # 写入 PublicRes
			
 
				+            self.public_res.save(os.path.join(temp_dir_doc_0, 'PublicRes.xml'))
			
 
				+
			
 
				+            # 写入 content_res
			
 
				+            for idx, page in enumerate(self.content_res):
			
 
				+                temp_dir_pages_idx = os.path.join(temp_dir_pages, f"Page_{idx}")
			
 
				+                os.mkdir(temp_dir_pages_idx)
			
 
				+                # os.mkdir(i)
			
 
				+                page.save(os.path.join(temp_dir_pages_idx, 'Content.xml'))
			
 
				+
			
 
				+            # 写入静态资源
			
 
				+            for k, v in self.res_static.items():
			
 
				+                  with open(os.path.join(temp_dir_res, k), "wb") as f:
			
 
				+                      f.write(v)
			
 
				+
			
 
				+            # 打包成ofd
			
 
				+            zip = zipfile.ZipFile("test.ofd", "w", zipfile.ZIP_DEFLATED)
			
 
				+            for path, dirnames, filenames in os.walk(temp_dir):
			
 
				+                # 去掉目标跟路径，只对目标文件夹下边的文件及文件夹进行压缩
			
 
				+                fpath = path.replace(temp_dir, '')
			
 
				+
			
 
				+                for filename in filenames:
			
 
				+                    zip.write(os.path.join(path, filename), os.path.join(fpath, filename))
			
 
				+            zip.close()
			
 
				+            with open("test.ofd", "rb") as f:
			
 
				+                content = f.read()
			
 
				+            if os.path.exists("test.ofd"):
			
 
				+               os.remove("test.ofd")
			
 
				+            return content
			
 
				+
			
 
				+if  __name__ == "__main__":
			
 
				+    print("---------")
			
 
				+    # 资源文件
			
 
				+    img_path = r"F:\code\easyofd\test\test_img0.jpg"
			
 
				+    # with open(img_path, "rb") as f:
			
 
				+    #     content = f.read()
			
 
				+    content = b""
			
 
				+    res_static = {"Image_0.jpg": content}
			
 
				+
			
 
				+    # 构建数据
			
 
				+    font = [
			
 
				+            {
			
 
				+
			
 
				+                "@FontName": "宋体",
			
 
				+                "@FamilyName": "宋体",
			
 
				+
			
 
				+            }
			
 
				+            ]
			
 
				+
			
 
				+    MultiMedia = [
			
 
				+                {
			
 
				+
			
 
				+                    "@Type": "Image",
			
 
				+                    "ofd:MediaFile": "Image_0.jpg"
			
 
				+                }
			
 
				+            ]
			
 
				+
			
 
				+    ImageObject = [{
			
 
				+
			
 
				+                        "@CTM": "200 0 0 140 0 0",
			
 
				+                        "@Boundary": "0 0 200 140",
			
 
				+                        "@ResourceID": "55"
			
 
				+                    }]
			
 
				+    TextObject = [
			
 
				+        {
			
 
				+
			
 
				+
			
 
				+        "@Boundary": "50 5 100 20",
			
 
				+        "@Font": "2",
			
 
				+        "@Size": "5",
			
 
				+        "ofd:FillColor": {
			
 
				+
			
 
				+            "@Value": "156 82 35",
			
 
				+            "@ColorSpace" : "1"
			
 
				+        },
			
 
				+
			
 
				+        "ofd:TextCode": {
			
 
				+            "@X": "5",
			
 
				+            "@Y": "5",
			
 
				+            "@DeltaX": "7 7 7 7 7 7 7 7 7",
			
 
				+            "#text": "电⼦发票（普通发票）"
			
 
				+        }
			
 
				+    }, {
			
 
				+
			
 
				+
			
 
				+        "@Boundary": "0 0 100 100",
			
 
				+        "@Font": "2",
			
 
				+        "@Size": "10",
			
 
				+        "ofd:FillColor": {
			
 
				+
			
 
				+            "@Value": "156 82 35"
			
 
				+        },
			
 
				+
			
 
				+        "ofd:TextCode": {
			
 
				+            "@X": "0",
			
 
				+            "@Y": "0",
			
 
				+            "@DeltaX": "0",
			
 
				+            "#text": "电"
			
 
				+        }
			
 
				+    }
			
 
				+    ]
			
 
				+
			
 
				+    # 实例化模板
			
 
				+    id_obj = CurId()
			
 
				+    print("id_obj实例化", id_obj)
			
 
				+
			
 
				+    ofd = OFDTemplate(id_obj=id_obj)
			
 
				+    document = DocumentTemplate(id_obj=id_obj)
			
 
				+    public_res = PublicResTemplate(Font=font, id_obj=id_obj)
			
 
				+    document_res = DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
			
 
				+    # ImageObject=ImageObject
			
 
				+    content_res = ContentTemplate(CGTransform=[], PathObject=[], TextObject=TextObject, ImageObject=[], id_obj=id_obj)
			
 
				+
			
 
				+
			
 
				+
			
 
				+    ofd_byte = OFDStructure("123",ofd=ofd, document=document,public_res=public_res,
			
 
				+                            document_res=document_res, content_res=[content_res], res_static=res_static)(test=True)
			
 
				+
			
 
				+    with open("test.ofd", "wb") as f:
			
 
				+        content = f.write(ofd_byte)
			
--- a/format_convert/easyofd/easyofd/draw/pdf_parse.py
+++ b/format_convert/easyofd/easyofd/draw/pdf_parse.py
@@ -0,0 +1,966 @@
 
				+import os
			
 
				+import re
			
 
				+import io
			
 
				+
			
 
				+import json
			
 
				+import time
			
 
				+import copy
			
 
				+import string
			
 
				+import random
			
 
				+from uuid import uuid1
			
 
				+from decimal import Decimal
			
 
				+from collections import OrderedDict
			
 
				+
			
 
				+# 第三方包
			
 
				+import fitz
			
 
				+from PIL import Image
			
 
				+# import pdfplumber
			
 
				+
			
 
# NOTE(review): Python only honours the lowercase name ``__all__`` as the
# export list; spelled ``__ALL__`` this assignment has no effect on
# ``from module import *``. ``pdf_ocr`` is not defined in the visible part of
# this file -- confirm it exists before renaming this to ``__all__``.
__ALL__ = ['pdf_ocr',"DPFParser"]
			
 
				+
			
 
				+class MyEncoder(json.JSONEncoder):
			
 
				+    def default(self, obj):
			
 
				+        if isinstance(obj, bytes):
			
 
				+            return str(obj)
			
 
				+        elif isinstance(obj, Decimal):
			
 
				+            return float(obj)
			
 
				+        return json.JSONEncoder.default(self, obj)
			
 
				+
			
 
				+class DPFParser(object):
			
 
				+    def __init__(self, ):
			
 
				+        pass
			
 
				+
			
 
				+    def extract_text_with_details(self, pdf_bytes):
			
 
				+        """
			
 
				+        提取PDF每页的文本及其位置、字体信息。
			
 
				+
			
 
				+        :param pdf_path: PDF文件路径
			
 
				+        :return: 包含每页文本及其详细信息的列表
			
 
				+        [[
			
 
				+
			
 
				+        ]]
			
 
				+        """
			
 
				+        details_list = []
			
 
				+        pdf_stream = io.BytesIO(pdf_bytes)
			
 
				+
			
 
				+        # 使用fitz.open直接打开BytesIO对象
			
 
				+
			
 
				+        with fitz.open(stream=pdf_stream, filetype="pdf") as doc:
			
 
				+            res_uuid_map = {
			
 
				+                "img": {},
			
 
				+                "font": {},
			
 
				+                "other": {}
			
 
				+            } # 全局资源标识
			
 
				+            for page_num in range(len(doc)):
			
 
				+
			
 
				+
			
 
				+                page_details_list = []  # 页面内信息
			
 
				+                page = doc.load_page(page_num)
			
 
				+                rect = page.rect
			
 
				+                width = rect.width
			
 
				+                height = rect.height
			
 
				+                if res_uuid_map["other"].get("page_size"):
			
 
				+                    res_uuid_map["other"]["page_size"][page_num] = [width,height]
			
 
				+                else :
			
 
				+                    res_uuid_map["other"]["page_size"] = {page_num: [width, height]}
			
 
				+                blocks = page.get_text("dict").get("blocks")  # 获取文本块信息
			
 
				+                image_list = page.get_images(full=True)  # 获取页面上所有图片的详细信息
			
 
				+                # print(blocks)
			
 
				+                # 获取页面内文本信息
			
 
				+                for block in blocks:
			
 
				+                    block_text = block.get("text", "")
			
 
				+                    block_rect = block["bbox"]  # 文本块的边界框，格式为[x0, y0, x1, y1]
			
 
				+
			
 
				+                    # 遍历块中的每一行
			
 
				+                    for line in block.get("lines", []):
			
 
				+                        line_text = line.get("spans", [{}])[0].get("text", "")  # 单行文本
			
 
				+                        line_rect = line["bbox"]  # 行的边界框
			
 
				+
			
 
				+                        # 遍历行中的每一个跨度（span），获取字体信息
			
 
				+                        for span in line.get("spans", []):
			
 
				+                            span_text = span.get("text", "")
			
 
				+                            font_size = span.get("size")  # 字体大小
			
 
				+                            font_name = span.get("font")  # 字体名称
			
 
				+                            res_uuid = None
			
 
				+                            if font_name not in res_uuid_map["font"].values():
			
 
				+                                res_uuid = str(uuid1())
			
 
				+                                res_uuid_map["font"][res_uuid] = font_name
			
 
				+                            else:
			
 
				+                                keys = list(res_uuid_map["font"].keys())
			
 
				+                                vs = list(res_uuid_map["font"].values())
			
 
				+                                idx = vs.index(font_name)
			
 
				+                                res_uuid =keys[idx]
			
 
				+                            font_color = span.get("color")  # 字体颜色，默认可能没有
			
 
				+                            span_rect = (
			
 
				+                            line_rect[0], line_rect[1], line_rect[2], line_rect[3])  # 使用行的边界框作为参考，具体到单个字符或词可能需要更复杂的处理
			
 
				+
			
 
				+                            # 打印或存储信息
			
 
				+                            print(
			
 
				+                                f"Page: {page_num }, Text: '{span_text}', Font: {font_name}, Size: {font_size}, "
			
 
				+                                f"Color: {font_color}, Rect: {span_rect} ,res_uuid {res_uuid}")
			
 
				+
			
 
				+                            # 存储信息到details_list中（根据需要调整存储格式）
			
 
				+                            page_details_list.append({
			
 
				+                                "page": page_num,
			
 
				+                                "text": span_text,
			
 
				+                                "font": font_name,
			
 
				+                                "res_uuid": res_uuid,
			
 
				+                                "size": font_size,
			
 
				+                                "color": font_color,
			
 
				+                                "bbox": list(span_rect),
			
 
				+                                "type": "text"
			
 
				+                            })
			
 
				+
			
 
				+                for image_index, img_info in enumerate(image_list):
			
 
				+                    # 解析图片信息
			
 
				+                    xref = img_info[0]
			
 
				+                    base_image = doc.extract_image(xref)
			
 
				+
			
 
				+                    image_data = base_image["image"]  # 图片数据
			
 
				+                    res_uuid = str(uuid1())
			
 
				+
			
 
				+                    img_io = io.BytesIO(image_data)
			
 
				+                    res_uuid_map["img"][res_uuid] = img_io
			
 
				+                    image_type = base_image["ext"]  # 图片类型
			
 
				+                    smask = base_image["smask"]  # 图片类型
			
 
				+                    xres = base_image["xres"]  # 图片类型
			
 
				+                    yres = base_image["yres"]  # 图片类型
			
 
				+                    width = base_image["width"]  # 图片宽度
			
 
				+                    height = base_image["height"]  # 图片高度
			
 
				+
			
 
				+
			
 
				+
			
 
				+                    # 计算坐标（左下角和右上角）
			
 
				+                    x0, y0, x1, y1 = xres, yres,xres+width,yres+height
			
 
				+                    print(
			
 
				+                        f"Page: {page_num}, image_type: '{image_type}',x0{x0}, y0{y0}, x1{x1}, y1{y1}  ")
			
 
				+                    page_details_list.append({
			
 
				+                        "page": page_num,
			
 
				+                        "index": image_index,
			
 
				+                        "x0": x0,
			
 
				+                        "y0": y0,
			
 
				+                        "x1": x1,
			
 
				+                        "y1": y1,
			
 
				+                        "bbox": [x0,y0,width,height],
			
 
				+                        "width": width,
			
 
				+                        "height": height,
			
 
				+                        "res_uuid": res_uuid,
			
 
				+                        "image_type": image_type,
			
 
				+                        "type": "img"
			
 
				+                    })
			
 
				+
			
 
				+                details_list.append(page_details_list)
			
 
				+        # print("details_list",details_list)
			
 
				+        return details_list, res_uuid_map
			
 
				+    def to_img(self, buffer_pdf):
			
 
				+        """pdf2img"""
			
 
				+        pix_list = []
			
 
				+        pdfDoc = fitz.open(stream=buffer_pdf)
			
 
				+        for pg in range(pdfDoc.page_count):
			
 
				+            page = pdfDoc[pg]
			
 
				+            rotate = int(0)
			
 
				+            # 每个尺寸的缩放系数为1.3，这将为我们生成分辨率提高2.6的图像。
			
 
				+            # 此处若是不做设置，默认图片大小为：792X612, dpi=96
			
 
				+            zoom_x = 1.33333333 #(1.33333333-->1056x816)   (2-->1584x1224)
			
 
				+            zoom_y = 1.33333333
			
 
				+            # zoom_x,zoom_y = (1,1)
			
 
				+            mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
			
 
				+            pix = page.get_pixmap(matrix=mat, alpha=False)
			
 
				+
			
 
				+
			
 
				+            pix_list.append(pix)
			
 
				+        return pix_list
			
 
				+           
			
 
				+            
			
 
				+            
			
 
				+    def get_size(self):
			
 
				+        pass
			
 
				+    
			
 
				+def coast_time(func):
			
 
				+    '''
			
 
				+    计算对象执行耗时
			
 
				+    '''
			
 
				+    def fun(*agrs, **kwargs):
			
 
				+        t = time.perf_counter()
			
 
				+        result = func(*agrs, **kwargs)
			
 
				+        print(f'function {func.__name__} coast time: {time.perf_counter() - t:.8f} s')
			
 
				+        return result
			
 
				+    return fun
			
 
				+
			
 
				+
			
 
				+class BaseInit:
			
 
				+    '''
			
 
				+    解析pdf所需的基本信息
			
 
				+    '''
			
 
				+
			
 
				+    def __init__(self, pdf_path, output_path):
			
 
				+
			
 
				+        self.file_path = pdf_path
			
 
				+        self.output_path = output_path
			
 
				+        # file_name
			
 
				+        self.file_name = os.path.basename(self.file_path)
			
 
				+        # file_type
			
 
				+        self.fileType = os.path.splitext(self.file_path)[-1]
			
 
				+        # no suffix
			
 
				+        self.file_no_suffix = self.file_name[:-len(self.fileType)]
			
 
				+        self.uuidChars = tuple(list(string.ascii_letters) + list(range(10)))
			
 
				+        # 表格占位、分割符
			
 
				+        self.divide = ':'
			
 
				+        self.solid = ''
			
 
				+        # 初始化整个过程需要创建的中间目录
			
 
				+        # iou 占比
			
 
				+        self.iou_rate = 0.001
			
 
				+        self.init_file()
			
 
				+
			
 
				+    def init_file(self):
			
 
				+        """
			
 
				+        初始化项目过程中需要创建的文件夹
			
 
				+        """
			
 
				+        self.image_folder_path = os.path.join(self.output_path, 'pdf_img_save')
			
 
				+        self.json_folder_path = os.path.join(self.output_path, 'json')
			
 
				+        self.ocr_result_path = os.path.join(self.json_folder_path, self.file_no_suffix + '.json')
			
 
				+        # 后面还有txt..., 目前的流程先需要5个
			
 
				+        for path in [self.image_folder_path, self.json_folder_path]:
			
 
				+            if not os.path.exists(path):
			
 
				+                os.makedirs(path)
			
 
				+
			
 
				+    def genShortId(self, length=12):
			
 
				+        """
			
 
				+        :params length: 默认随机生成的uuid长度
			
 
				+        """
			
 
				+        uuid = str(uuid1()).replace('-', '')
			
 
				+        result = ''
			
 
				+        for i in range(0, 8):
			
 
				+            sub = uuid[i * 4: i * 4 + 4]
			
 
				+            x = int(sub, 16)
			
 
				+            result += str(self.uuidChars[x % 0x3E])
			
 
				+        return result + ''.join(random.sample(uuid, length - 8))
			
 
				+
			
 
				+
			
 
				+class PageInfo(BaseInit):
			
 
				+    '''
			
 
				+    记录每页中的 图片和表格信息
			
 
				+    '''
			
 
				+    __page_image = {}
			
 
				+    __page_table = {}
			
 
				+
			
 
				+    @classmethod
			
 
				+    def add_image(cls, page_num, image):
			
 
				+        if not cls.__page_image.get(page_num):
			
 
				+            cls.__page_image[page_num] = []
			
 
				+        cls.__page_image[page_num].append(image)
			
 
				+
			
 
				+    @classmethod
			
 
				+    def add_table(cls, page_num, table):
			
 
				+        if not cls.__page_table.get(page_num):
			
 
				+            cls.__page_table[page_num] = []
			
 
				+        cls.__page_table[page_num].append(table)
			
 
				+
			
 
				+    @classmethod
			
 
				+    def get_image(cls, page_num):
			
 
				+        return cls.__page_image.get(page_num, [])
			
 
				+
			
 
				+    @classmethod
			
 
				+    def get_table(cls, page_num):
			
 
				+        return cls.__page_table.get(page_num, [])
			
 
				+
			
 
				+    @classmethod
			
 
				+    def save_image(cls, output_path, file):
			
 
				+        '''
			
 
				+        保存图片至本地
			
 
				+        :param output:
			
 
				+        :return:
			
 
				+        '''
			
 
				+        file = file.split('.')[0]
			
 
				+        for images in cls.__page_image.values():
			
 
				+            for image in images:
			
 
				+                iamge_content = image['objContent']
			
 
				+                name = image['name']
			
 
				+                img_dir = os.path.join(output_path, 'page_img_save')
			
 
				+                img_path = os.path.join(img_dir, file + '_' + name + '.jpg')
			
 
				+                if not os.path.exists(img_dir):
			
 
				+                    os.mkdir(img_dir)
			
 
				+                with open(img_path, 'wb') as fp:
			
 
				+                    fp.write(iamge_content)
			
 
				+
			
 
				+
			
 
				+class ParseFile(PageInfo):
			
 
				+
			
 
    def __init__(self, pdf_path, output_path, table_type='v2', is_save=True):
        """
        :param pdf_path: path of the PDF to parse
        :param output_path: directory for generated output
        :param table_type: result layout; ``'v2'`` merges tables into the line list
        :param is_save: whether parse_pdf stores the final result via save_result
        """
        super().__init__(pdf_path, output_path)
        message = '初始化 pdf 对象：{}'.format(self.file_path)
        print(message)
        self.table_type = table_type
        self.is_save = is_save
        # First-generation results: lines and tables kept apart
        self.page_result_list = []
        # Second-generation results: tables merged into lines
        self.combine_page_result_list = []
			
 
				+
			
 
    @coast_time
    def get_result(self):
        """Load the PDF, parse it, remember the result on ``self.ocr_result`` and return it."""
        self.load_pdf()
        parsed_pages = self.parse_pdf()
        self.ocr_result = parsed_pages
        print(f'解析完成：共 {len(parsed_pages)} 页  表格类型： {self.table_type}')
        return parsed_pages
			
 
				+
			
 
    def load_pdf(self):
        """Open ``self.file_path`` as a PDF with PyMuPDF and keep it on ``self.fitz_doc``."""
        self.fitz_doc = fitz.open(self.file_path, filetype='pdf')
        # NOTE(review): the pdfplumber loading below is disabled, so
        # ``self.pdfplum_doc_pages`` is never assigned here although parse_pdf
        # reads it -- confirm where that attribute is expected to come from.
        # self.pdfplum_doc_pages = pdfplumber.open(self.file_path).pages
        # assert len(self.fitz_doc) == len(self.pdfplum_doc_pages)
			
 
				+
			
 
    def parse_pdf(self):
        """Parse every page of ``self.fitz_doc`` and return the per-page results.

        For each page the text lines, tables and images are gathered and
        combined through ``construct_final_result``; for table type ``'v2'``
        the tables are additionally merged into the line list. The collected
        pages are deep-copied, reshaped with ``reform_ocr_result`` for
        ``'v2'``, given a ``contIndex`` (lineId -> copy of the line), saved
        when ``self.is_save`` is set, and returned.

        :return: list with one result dict per page
        """
        for page_no, fitz_doc in enumerate(self.fitz_doc):
            # Parse the page dict once; it was previously requested twice.
            page_dict = fitz_doc.get_text('dict')
            self.height = page_dict['height']
            self.width = page_dict['width']
            # Aggregate character / line / block information of the page
            line_list = self.group_block(page_no, fitz_doc)
            # Table information of the page.
            # NOTE(review): ``self.pdfplum_doc_pages`` is not assigned by
            # load_pdf (its pdfplumber code is commented out) -- confirm.
            table_list = self.extract_table(page_no, self.pdfplum_doc_pages[page_no])
            # Row / column merge information of the tables
            table_list = list(CalcTableRL(table_list).run())
            # Images recorded for this page
            image_list = self.get_image(page_no)
            # Per-page result
            page_result = self.construct_final_result(line_list, page_no, image_list, table_list)

            if self.table_type == 'v2':
                # OCR-style layout: tables are merged into the line list
                combine_page_result_list = self.combine_table_v2(page_result)
                page_result = self.construct_final_result(combine_page_result_list, page_no, image_list, table_list)

            self.page_result_list.append(page_result)
            if page_no and page_no % 10 == 0:
                print(f'解析前 {page_no} 页完成')
        final_result_list = copy.deepcopy(self.page_result_list)
        # Convert to the format expected by the OCR consumers
        if self.table_type == 'v2':
            final_result_list = self.reform_ocr_result(final_result_list)
        # 2023/09/26: add contIndex before saving, for the extraction model
        for page_num, page in enumerate(final_result_list):
            if not page.get('lineList'):
                # Skip only this page. The former ``break`` stopped indexing
                # every page after the first one without lines.
                continue
            contIndex = {}
            for line in page['lineList']:
                line_bak = dict(copy.copy(line))
                line_bak["objType_postpreprocess"] = f"{line_bak.get('objType','textLine')}_postpreprocess"
                contIndex[line_bak["lineId"]] = line_bak

            page["contIndex"] = contIndex
            for line in page['lineList']:
                print(page_num, line['objType'], line['objContent'])
        # Save to disk
        if self.is_save:
            self.save_result(final_result_list)
        for page_num, page in enumerate(final_result_list):
            # ``or []`` keeps a page without a line list from raising here.
            for line in page.get('lineList') or []:
                print(page_num, line['objType'], line['objContent'])
        return final_result_list
			
 
				+
			
 
				+    def combine_table_v2(self, page_result):
			
 
				+        lineList = page_result['lineList']
			
 
				+        table_list = page_result['table_list']
			
 
				+        # 先进行表格行、非表格行划分 减少后续操作的时间杂度
			
 
				+        __notable_lines, __all_table_lines = self.filter_table_line(lineList, table_list)
			
 
				+        notable_lines, all_table_lines = copy.deepcopy(__notable_lines), copy.deepcopy(__all_table_lines)
			
 
				+        del __notable_lines, __all_table_lines, lineList
			
 
				+        # 整合
			
 
				+        combine_page_result_list = self.combine_table_with_line(notable_lines, all_table_lines, table_list)
			
 
				+        return combine_page_result_list
			
 
				+
			
 
    def filter_table_line(self, lineList, table_list):
        '''
        Split the page lines into table lines and non-table lines.

        For every table, lines are taken from the front of ``lineList`` one by
        one; a line whose ``count_iou`` with the table box is positive belongs
        to that table. At the position of a table's first matching line the
        string marker ``'table'`` is appended to the non-table list so the
        table rows can be inserted there later.

        Note that ``lineList`` is consumed in place (``pop(0)`` / ``insert``),
        so the caller's list is modified.
        NOTE(review): the early-exit test presumably relies on lines and
        tables being ordered top to bottom -- confirm against the callers.

        :param lineList: line dicts, each with an ``'objPos'`` box
        :param table_list: table dicts, each with an ``'objPos'`` box
        :return: ``(__notable_lines, __all_table_lines)`` -- the non-table
            lines (plus ``'table'`` markers), and one list of member lines per
            table in ``table_list`` order
        '''
        __notable_lines = []
        __all_table_lines = []
        for table_info in table_list:
            table_bbox = table_info['objPos']
            # all lines that belong to the current table
            __sub_table_lines = []
            is_iter_table = False
            while lineList:
                line = lineList.pop(0)
                line_bbox = line['objPos']
                # Guard against empty tables: once a line's y (bbox[1]) has
                # reached the table's bbox[3], stop so the remaining lines are
                # not swallowed by this table.
                table_y, line_y = table_bbox[3], line_bbox[1]
                if line_y >= table_y:
                    lineList.insert(0, line)
                    break
                iou = self.count_iou(table_bbox, line_bbox)
                # the line overlaps the table: it is a table line
                if iou > 0:
                    __sub_table_lines.append(line)
                    # first line matched for this table
                    if not is_iter_table:
                        is_iter_table = True
                        # insert the placeholder marker
                        __notable_lines.append('table')
                elif iou <= 0 and not is_iter_table:
                    # table not reached yet: ordinary line
                    __notable_lines.append(line)
                # the table appears to have ended
                elif iou <= 0 and is_iter_table:
                    lineList.insert(0, line)
                    line_index, flag = self.more_judge(table_bbox, lineList)
                    if flag:
                        # a later line still overlaps: the lines before it go
                        # to the non-table list and scanning resumes there
                        __notable_lines.extend(lineList[:line_index])
                        lineList = lineList[line_index:]
                    else:
                        break
            __all_table_lines.append(__sub_table_lines)
        # all tables handled: the remaining lines are non-table lines
        if lineList:
            __notable_lines.extend(lineList)
        return __notable_lines, __all_table_lines
			
 
				+
			
 
    def more_judge(self, table_bbox, lineList, max_judge=6):
        '''
        Look ahead to see whether a following line still belongs to the table.

        Extra check for cases where the border between table and text is not
        clear-cut (e.g. multi-column pages, incomplete tables).

        :param table_bbox: box of the current table
        :param lineList: upcoming line dicts, each with an ``'objPos'`` box
        :param max_judge: number of leading lines to inspect
        :return: ``(index, found)``. When ``found`` is True, ``index`` is the
            position of the first inspected line whose ``count_iou`` with the
            table is positive. Otherwise ``index`` is the position of the last
            inspected line, or 0 when no line was inspected.
        '''
        # Default for the case where nothing is inspected; without it the
        # final return raised UnboundLocalError on an empty look-ahead.
        index = 0
        # Slicing already copes with lists shorter than max_judge.
        for index, line in enumerate(lineList[:max_judge]):
            if self.count_iou(table_bbox, line['objPos']) > 0:
                return index, True
        return index, False
			
 
				+
			
 
				+
			
 
    def combine_table_with_line(self, notable_lines, all_table_lines, table_list):
        '''
        Merge text lines and their characters into the matching table rows
        and cells, then put the rows where the ``'table'`` markers are.

        :param notable_lines: non-table lines containing ``'table'`` markers
        :param all_table_lines: per table, the text lines that belong to it
        :param table_list: table dicts, each with a ``'lineList'`` of rows
        :return: the line list with each handled marker replaced by its rows
        '''
        for table_id, table in enumerate(table_list):
            merged_rows = []
            for table_line in table['lineList']:
                row_matched = False
                row_bbox = table_line['objPos']
                # every text line of this table is checked against the row
                for source_line in all_table_lines[table_id]:
                    line = copy.deepcopy(source_line)
                    overlap = self.count_iou(row_bbox, line['objPos'])
                    if not (overlap > self.iou_rate):
                        continue
                    if row_matched:
                        # further lines of the same row only contribute chars
                        self.combine_cell_with_span(table_line, line)
                        continue
                    # First hit for this row: the text line takes over the
                    # row's content and position; its other fields stay.
                    row_matched = True
                    line['objContent'] = table_line['objContent']
                    line['objPos'] = table_line['objPos']
                    line['objType'] = 'table'
                    line['tableId'] = table_id
                    self.combine_cell_with_span(table_line, line)
                    line['cells'] = table_line['cells']
                    merged_rows.append(line)
            if not ('table' in notable_lines and merged_rows):
                # Avoids "'table' is not in list", e.g. when a small table is
                # detected inside a bigger one. Known limitation: with several
                # nested tables, rows and markers may not pair up.
                continue
            # replace the first remaining marker by the rows of this table
            marker_pos = notable_lines.index('table')
            notable_lines = (
                notable_lines[:marker_pos] + merged_rows + notable_lines[marker_pos + 1:])
        return notable_lines
			
 
				+
			
 
    def combine_cell_with_span(self, table_line, text_line):
        '''
        Attach to each cell of a table row the characters of the text line
        that overlap it; this keeps the character order intact when a cell
        spans several text lines. Cells whose ``objPos`` is None are removed
        from the row.

        :param table_line: table row dict with a ``'cells'`` list (modified in place)
        :param text_line: text line dict with a ``'span'`` list
        :return: None
        '''
        cells = table_line['cells']
        invalid_positions = []
        for position, cell in enumerate(cells):
            if not cell.get('chars'):
                cell['chars'] = []
            cell_bbox = cell['objPos']
            if cell_bbox is None:
                invalid_positions.append(position)
                continue
            for span in text_line['span']:
                span_bbox = span['bbox']
                if self.count_iou(cell_bbox, span_bbox) < self.iou_rate:
                    continue
                # Spans and cells do not always cover the same extent, so the
                # decision is made per character.
                for char in span['chars']:
                    if self.count_iou(cell_bbox, char['bbox']) > self.iou_rate:
                        cell['chars'].append(char)
        # drop the cells without a position, back to front so the remaining
        # positions stay valid
        for position in reversed(invalid_positions):
            del cells[position]
			
 
				+
			
 
    def group_block(self, page_num, fitz_doc):
        """
        Merge the ``rawdict`` (per-character) and ``dict`` (per-span) text
        extractions so that every span also carries its characters.
        Reference: https://pymupdf.readthedocs.io/en/latest/textpage.html#textpagedict

        Image blocks (``type == 1``) are handed to ``deal_image`` / ``add_image``
        and produce no line. Whitespace-only spans and empty lines are skipped.

        :param page_num: page number forwarded to the helpers
        :param fitz_doc: object exposing ``get_text('rawdict' | 'dict')``
        :return: list of the values returned by ``construct_line_info``
        """
        def reading_order(block):
            # Top-to-bottom, then left-to-right, on integer coordinates.
            return [int(block['bbox'][1]), int(block['bbox'][0])]

        # rawdict: the finest granularity is a single character.
        raw_blocks = fitz_doc.get_text('rawdict')['blocks']
        # dict: the finest granularity is a span inside a line.
        dict_blocks = fitz_doc.get_text('dict')['blocks']
        raw_blocks.sort(key=reading_order)
        dict_blocks.sort(key=reading_order)

        collected = []
        next_line_no = 0
        # Both lists are sorted the same way and are paired positionally.
        for span_blocks, char_block in zip(dict_blocks, raw_blocks):
            if span_blocks['type'] == 1:
                # Image block: store the picture and move on.
                self.add_image(page_num, self.deal_image(page_num, next_line_no, span_blocks))
                continue
            for line_index, line in enumerate(span_blocks['lines']):
                line['text'] = ''
                line['chars'] = []
                line['span'] = []
                # Merge the spans of the line here to avoid a second pass.
                for span_index, span in enumerate(line['spans']):
                    cleaned = span['text'].replace(' ', '').strip()
                    span['text'] = cleaned
                    if not cleaned:
                        continue
                    # Take the matching span from the per-character extraction.
                    visible_chars = [
                        ch
                        for ch in char_block['lines'][line_index]['spans'][span_index]['chars']
                        if ch['c'].strip()
                    ]
                    line['text'] += cleaned
                    line['chars'].extend(visible_chars)
                    line['span'].append({'bbox': span['bbox'], 'chars': visible_chars, 'text': cleaned})
                if not line['text']:
                    continue
                collected.append(
                    self.construct_line_info(line['text'], line['bbox'], line['span'], line['chars'],
                                             next_line_no, page_num)
                )
                next_line_no += 1
        return collected
			
 
				+
			
 
    def extract_table(self, page_no, plum_page):
        '''
        Extract every table found on a page.

        :param page_no: page number forwarded to ``deal_table`` / ``add_table``
        :param plum_page: page object exposing ``find_tables()``
        :return: list of table-info dicts, one per table that has non-empty rows
        '''
        tables = []
        for table in plum_page.find_tables():
            # Merge the cells of the table row by row; skip tables with no rows.
            rows = self.merge_table_row(table)
            if rows:
                info = self.deal_table(page_no, table.bbox, rows)
                tables.append(info)
                # Also register the table on the instance (somewhat redundant).
                self.add_table(page_no, info)
        return tables
			
 
				+
			
 
    def merge_table_row(self, table):
        '''
        Merge the cells of a table row by row.

        :param table: table object exposing ``extract()`` and ``rows``
        :return: list of ``(row_text, row_bbox, zip(cell_texts, row_cells))``
                 tuples; the third item is a one-shot iterator
        '''
        merged_rows = []
        for texts, row in zip(table.extract(), table.rows):
            # Normalise each cell text, then join the cells with the divider.
            joined = self.divide.join([self.clear_text(text) for text in texts])
            # Skip rows that are empty after cleaning.
            # NOTE(review): clear_text substitutes self.solid for empty text, so
            # this check presumably only fires when self.solid is empty - confirm.
            if self.clear_text(joined).replace(' ', ''):
                merged_rows.append((joined, row.bbox, zip(texts, row.cells)))
        return merged_rows
			
 
				+
			
 
    def clear_text(self, txt, retrans=False):
        '''
        Normalise a piece of table text.

        :param txt: text to clean; when ``retrans`` is False an empty value is
                    replaced by the placeholder ``self.solid``
        :param retrans: when True, strip the placeholder ``self.solid`` and the
                        divider ``self.divide`` from ``txt`` instead
        :return: ``str(txt)`` with newlines and spaces removed
        '''
        if retrans:
            txt = txt.replace(self.solid, '')
            txt = txt.replace(self.divide, '')
        elif not txt:
            # Empty column: use the placeholder.
            txt = self.solid
        flattened = str(txt)
        for unwanted in ('\n', ' '):
            flattened = flattened.replace(unwanted, '')
        return flattened
			
 
				+
			
 
    def deal_table(self, page_no, table_bbox, table_line_list):
        '''
        Convert the merged rows of one table into the output table structure.

        :param page_no: page number, used as the prefix of the table id
        :param table_bbox: bounding box stored as the table ``objPos``
        :param table_line_list: non-empty list of ``(text, bbox, cells)`` rows
        :return: dict with ``tableId``, ``name``, ``objPos`` and ``lineList``
        '''
        # The id combines the page number, the cleaned first row and a short id.
        header_text = self.clear_text(table_line_list[0][0], retrans=True)
        table_id = '{0}_{1}_'.format(page_no, header_text) + self.genShortId()

        rows = []
        for line in table_line_list:
            rows.append({
                'objContent': line[0],
                'objPos': line[1],
                'cells': self.deal_table_cell(line[2]),
            })

        return {
            'tableId': table_id,
            'name': table_id,
            'objPos': table_bbox,
            'lineList': rows,
        }
			
 
				+
			
 
    def deal_table_cell(self, cells):
        '''
        Convert ``(text, box)`` pairs into cell dicts.

        :param cells: iterable of ``(text, box)`` pairs
        :return: list of ``{"objContent": cleaned text, "objPos": box}``
        '''
        converted = []
        for text, box in cells:
            converted.append({"objContent": self.clear_text(text), "objPos": box})
        return converted
			
 
				+
			
 
    def deal_image(self, page_num, name, img_attrs):
        '''
        Convert an image block into the output image structure.

        :param page_num: page number, used as the prefix of the image id
        :param name: second component of the image id (callers in this file
                     pass the current line count)
        :param img_attrs: dict providing ``bbox``, ``ext``, ``image`` and ``size``
        :return: dict with ``imageId``, ``name``, ``objPos``, ``ext``,
                 ``objContent`` and ``size``
        '''
        image_id = '{0}_{1}_'.format(page_num, name) + self.genShortId()
        # For now the image is named after its id.
        img_info = {'imageId': image_id, 'name': image_id}
        field_sources = (
            ('objPos', 'bbox'),
            ('ext', 'ext'),
            ('objContent', 'image'),
            ('size', 'size'),
        )
        for target_key, source_key in field_sources:
            img_info[target_key] = img_attrs[source_key]
        return img_info
			
 
				+
			
 
    def deal_chars(self, line_num, lineId, chars):
        '''
        Convert raw characters into the output character structure.

        Whitespace-only characters are dropped and do not consume a sequence
        number.

        :param line_num: line number embedded in each ``charId``
        :param lineId: id of the owning line, copied into every entry
        :param chars: iterable of dicts with ``c`` (the character) and ``bbox``
        :return: list of dicts with ``lineId``, ``charId``, ``objContent``, ``objPos``
        '''
        visible = (char for char in chars if char['c'].strip())
        return [
            {
                'lineId': lineId,
                'charId': 'char_' + str(line_num) + '_' + str(sequence) + '_' + self.genShortId(),
                'objContent': char['c'],
                'objPos': char['bbox'],
            }
            for sequence, char in enumerate(visible)
        ]
			
 
				+
			
 
    def construct_line_info(self, text, rect, span, chars, count, pageNo, objType='textLine'):
        '''
        Build the output structure of one line.

        :param text: line text; all whitespace is removed for ``objContent``
        :param rect: bounding box stored unchanged as ``objPos``
        :param span: span list stored unchanged
        :param chars: raw characters, converted through ``deal_chars``
        :param count: line number, stored as ``lineNo`` and embedded in the id
        :param pageNo: page number embedded in the id
        :param objType: object type label, ``textLine`` by default
        :return: OrderedDict describing the line
        '''
        line_id = 'line_' + str(pageNo) + '_' + str(count) + '_' + self.genShortId()
        converted_chars = self.deal_chars(count, line_id, chars)

        info = OrderedDict()
        info['lineNo'] = count
        info['lineId'] = line_id
        info['objType'] = objType
        info['objContent'] = re.sub(r'\s', '', text)
        info['chars'] = converted_chars
        info['objPos'] = rect
        info['span'] = span
        return info
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def rect_format(bbox):
			
 
				+        '''
			
 
				+        数据坐标转换 x1, y1, x2, y2 >> y1, x1 h, w
			
 
				+        :param rect: [x1, y1, x2, y2]
			
 
				+        :return: [y, x, h, w]
			
 
				+        '''
			
 
				+        y, x, h, w = bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]
			
 
				+        return [y, x, h, w]
			
 
				+
			
 
    def count_iou(self, RecA, RecB):
        '''
        Compute the intersection-over-union of two boxes.

        Both boxes are given as [x0, y0, x1, y1] (top-left, bottom-right).
        The intersection size is:
            width  = min(Ax1, Bx1) - max(Ax0, Bx0)
            height = min(Ay1, By1) - max(Ay0, By0)
        clamped at 0 when the boxes do not overlap.

        :param RecA: first box
        :param RecB: second box
        :return: IoU as a float; 0.0 when the union area is zero (both boxes
                 degenerate), instead of raising ZeroDivisionError
        '''
        overlap_w = min(RecB[2], RecA[2]) - max(RecB[0], RecA[0])
        overlap_h = min(RecB[3], RecA[3]) - max(RecB[1], RecA[1])

        # Intersection area; a negative extent means no overlap.
        interArea = max(0, overlap_w) * max(0, overlap_h)

        # Areas of the two boxes.
        RecA_Area = (RecA[2] - RecA[0]) * (RecA[3] - RecA[1])
        RecB_Area = (RecB[2] - RecB[0]) * (RecB[3] - RecB[1])

        union = float(RecA_Area + RecB_Area - interArea)
        if union == 0:
            # Two zero-area boxes: there is nothing to overlap.
            return 0.0
        return interArea / union
			
 
				+
			
 
    def construct_final_result(self, line_list, pageNo, image_list=None, table_list=None):
        '''
        Build the final structure of one page.

        :param line_list: lines of the page
        :param pageNo: page number
        :param image_list: images of the page; None or empty becomes a new ``[]``
        :param table_list: tables of the page; None or empty becomes a new ``[]``
        :return: OrderedDict with ``pageNo``, ``docID``, ``page_info``,
                 ``lineList``, ``image_list`` and ``table_list``
        '''
        # Defaults are None rather than [] so no list object is shared between
        # calls; falsy values were already normalised to a new list.
        document_id = 'v1' + '_' + self.file_no_suffix + '_' + self.genShortId()
        return OrderedDict({
            'pageNo': pageNo,
            'docID': document_id,
            'page_info': {'size': [self.width, self.height]},
            'lineList': line_list,
            'image_list': image_list if image_list else [],
            'table_list': table_list if table_list else [],
        })
			
 
				+
			
 
    def save_result(self, final_result_list):
        '''
        Write the result as JSON to ``self.ocr_result_path``.

        For ``table_type == 'v2'`` the given ``final_result_list`` is written;
        otherwise ``self.page_result_list`` is written using ``MyEncoder``.

        :param final_result_list: data written in the ``v2`` case
        '''
        write_given_result = self.table_type == 'v2'
        with open(self.ocr_result_path, 'w', encoding='utf-8') as f:
            if write_given_result:
                json.dump(final_result_list, f, indent=4, ensure_ascii=False)
            else:
                json.dump(self.page_result_list, f, cls=MyEncoder, indent=4, ensure_ascii=False)
			
 
				+
			
 
    def reform_ocr_result(self, final_result_list):
        """
        Final in-place clean-up of the page results, renumbering the lines.

        For every page the ``image_list`` and ``table_list`` keys are removed.
        For every line the number and id are rewritten, ``objPos`` is converted
        with ``rect_format`` and extended with the x offsets, ``chars_offset``
        is added, and ``chars`` (plus ``span`` for tables) is dropped.

        :param final_result_list: list of page dicts
        :return: the same list object
        """
        for page in final_result_list:
            del page['image_list']
            del page['table_list']
            for number, line in enumerate(page['lineList']):
                # Rewrite the line number and the matching part of the id.
                number_text = str(number)
                line['lineNo'] = number_text
                id_parts = line['lineId'].split('_')
                id_parts[-2] = number_text
                line['lineId'] = '_'.join(id_parts)

                kind = line['objType']
                # Offsets are relative to the line's original corner, so they
                # must be computed before objPos is converted.
                x_offsets, y_offsets = self.coord_offset(line, kind)
                converted_pos = self.rect_format(line['objPos'])
                converted_pos.append(x_offsets)
                line['objPos'] = converted_pos
                line['chars_offset'] = [x_offsets, y_offsets]

                if line.get('chars'):
                    del line['chars']
                if kind == 'table' and line.get('span'):
                    del line['span']
        return final_result_list
			
 
				+
			
 
    def coord_offset(self, line, obj_type='textLine'):
        '''
        Compute, for every character, the offset of its top-left corner from
        the top-left corner of the line.

        Side effects: for ``textLine`` the spans and characters of ``line`` are
        converted in place with ``all_rect_format``; for any other type the
        cells are deep-copied, converted, and ``line['cells']`` is replaced by
        the copies.

        :param line: line dict with ``objPos`` and ``span`` or ``cells``
        :param obj_type: textLine | table
        :return: (list of x offsets, list of y offsets)
        '''
        x_offsets = []
        y_offsets = []
        origin_x, origin_y = line['objPos'][0], line['objPos'][1]

        def record(chars):
            # Read the raw corner first: all_rect_format removes 'bbox'.
            for char in chars:
                corner_x, corner_y = char['bbox'][0], char['bbox'][1]
                x_offsets.append(corner_x - origin_x)
                y_offsets.append(corner_y - origin_y)
                self.all_rect_format(char)

        if obj_type == 'textLine':
            for span in line['span']:
                self.all_rect_format(span)
                record(span['chars'])
        else:
            converted_cells = []
            for source_cell in line['cells']:
                # Work on a copy so the original cell keeps its raw format.
                cell = copy.deepcopy(source_cell)
                self.all_rect_format(cell)
                record(cell['chars'])
                converted_cells.append(cell)
            line['cells'] = converted_cells
        return x_offsets, y_offsets
			
 
				+
			
 
    def all_rect_format(self, obj):
        '''
        Convert an object in place to the field names and box format used in
        the OCR output.

        Objects with a ``chars`` key (spans, cells): a truthy ``text`` is moved
        to ``objContent``; a truthy ``objPos`` is converted with ``rect_format``,
        otherwise a truthy ``bbox`` is converted into ``objPos`` and removed.
        Objects without ``chars`` (single characters): ``c`` becomes
        ``objContent`` and ``bbox`` becomes ``objPos``.
        '''
        if 'chars' not in obj:
            # A single character.
            obj['objContent'] = obj['c']
            obj['objPos'] = self.rect_format(obj['bbox'])
            del obj['c']
            del obj['bbox']
            return

        if obj.get('text'):
            obj['objContent'] = obj.pop('text')

        existing_pos = obj.get('objPos')
        if existing_pos:
            obj['objPos'] = self.rect_format(existing_pos)
        elif obj.get('bbox'):
            obj['objPos'] = self.rect_format(obj['bbox'])
            del obj['bbox']
			
 
				+
			
 
				+class CalcTableRL:
			
 
				+    '''
			
 
				+    还原表格虚线 计算表格行列合并信息
			
 
				+    输入目标表格结构信息：必须包含所有的cell坐标
			
 
				+    在目标表格结构cell上加上row_start_end, col_start_end属性
			
 
				+    '''
			
 
				+    def __init__(self, table_info):
			
 
				+        self.table_info = table_info
			
 
				+
			
 
				+    def run(self):
			
 
				+        if isinstance(self.table_info, list):
			
 
				+            for table_info in self.table_info:
			
 
				+                table_info = self.add_table_property(table_info)
			
 
				+                yield table_info
			
 
				+        else:
			
 
				+            table_info = self.add_table_property(self.table_info)
			
 
				+            yield table_info
			
 
				+    def add_table_property(self, table_info):
			
 
				+        '''
			
 
				+        表格结构增加行列合并信息:
			
 
				+        cell['col_start_end'] = (col_start, col_end)
			
 
				+        cell['row_start_end'] = (row_start, row_end)
			
 
				+        '''
			
 
				+        # 分别得到所有排序好的行列坐标
			
 
				+        set_x, set_y = self.collect_table_coord(table_info)
			
 
				+        # 排序 后的set_x，set_y 坐标集合就是最小粒度表格
			
 
				+        list_x, list_y = sorted(set_x), sorted(set_y)
			
 
				+        for line in table_info['lineList']:
			
 
				+            for cell in line['cells']:
			
 
				+                if cell['objPos'] == None:
			
 
				+                    continue
			
 
				+                x1, y1, x2, y2 = cell['objPos']
			
 
				+                # 查找坐标点在虚线表格中对应的位置
			
 
				+                col_start = list_x.index(x1)
			
 
				+                col_end = list_x.index(x2)
			
 
				+                row_start = list_y.index(y1)
			
 
				+                row_end = list_y.index(y2)
			
 
				+                cell['col_start_end'] = (col_start, col_end)
			
 
				+                cell['row_start_end'] = (row_start, row_end)
			
 
				+                # print(f"{cell['objContent']} 属于行：{cell['row_start_end']} 属于列：{cell['col_start_end']}")
			
 
				+        return table_info
			
 
				+
			
 
				+    def collect_table_coord(self, table_info):
			
 
				+        '''
			
 
				+        获取所有x, y坐标点
			
 
				+        传入单个表格信息，提取出其中所有cell的x1, y1, x2, y2坐标点 去重
			
 
				+        :param table_info:
			
 
				+        :return: set(x), set(y)
			
 
				+        '''
			
 
				+        set_x = set()
			
 
				+        set_y = set()
			
 
				+        for line in table_info['lineList']:
			
 
				+            for cell in line['cells']:
			
 
				+                if cell['objPos'] == None:
			
 
				+                    continue
			
 
				+                x1, y1, x2, y2 = cell['objPos']
			
 
				+                set_x.add(x1)
			
 
				+                set_x.add(x2)
			
 
				+                set_y.add(y1)
			
 
				+                set_y.add(y2)
			
 
				+        return set_x, set_y
			
 
				+
			
 
				+
			
 
				+
			
 
				+def pdf_ocr(pdf_path, output_path, table_type='v2', is_save=True):
			
 
				+    '''
			
 
				+    简单封装, 方便调用和多线程
			
 
				+    '''
			
 
				+    pdf = ParseFile(pdf_path, output_path, table_type, is_save)
			
 
				+    pdf.get_result()
			
 
				+    return pdf
			
 
				+
			
 
				+# ---------------------------以下是测试案列-----------------------------------
			
 
				+
			
 
				+@coast_time
			
 
				+def test_dir():
			
 
				+    for root in os.walk(r'E:\workplace\cjhx_test\创金和信\pdf2json\input\all_test'):
			
 
				+        dir, files = root[0], root[2]
			
 
				+        for file in files:
			
 
				+            if 'test.pdf' not in file:
			
 
				+                continue
			
 
				+            file_path = os.path.join(dir, file)
			
 
				+            output_dir = r'E:\workplace\cjhx_test\创金和信\pdf2json\file_data\all_test'
			
 
				+            pdf_ocr_result = pdf_ocr(file_path, output_dir)
			
 
				+
			
 
				+@coast_time
			
 
				+def test_single():
			
 
				+    # file_path = r'E:\workplace\daily_work\pdf2json\input\all_test\测试足够复杂的表格解析.pdf'
			
 
				+    file_path = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/20220913-浙江省贰号职业年金计划银华资产组合2022年二季度管理费用支付指令.pdf'
			
 
				+    # file_path = r'E:\workplace\daily_work\pdf2json\input\all_test\公开募集基金销售支付结算机构名录(2022年9月)(1).pdf'
			
 
				+    # file_path = r'C:\Users\Administrator\Documents\WeChat Files\wxid_x36dhycno4s121\FileStorage\File\2022-11\20210928-ZL001-西部利得天添鑫货币B-申购5000万-确认书.pdf'
			
 
				+    # file_path = r'E:\workplace\daily_work\pdf2json\input\all_test\2-信息系统部2021年大数据平台系统维护服务--工作记录表和考核表2021Q3-原版.pdf'
			
 
				+    output_dir = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/电子解析'
			
 
				+    pdf = pdf_ocr(file_path, output_dir, table_type='v2')
			
 
				+    # print(pdf.ocr_result)
			
 
				+
			
 
				+@coast_time
			
 
				+def test_thread():
			
 
				+    # 多进程
			
 
				+    from concurrent.futures import ProcessPoolExecutor
			
 
				+    pool = ProcessPoolExecutor(max_workers=8)
			
 
				+    # 多线程
			
 
				+    # from concurrent.futures import ThreadPoolExecutor
			
 
				+    # pool = ThreadPoolExecutor(max_workers=8)
			
 
				+    for root in os.walk(r'E:\workplace\daily_work\pdf2json\input\签字模板二'):
			
 
				+        dir, files = root[0], root[2]
			
 
				+        for file in files:
			
 
				+            file_path = os.path.join(dir, file)
			
 
				+            output_dir = r'E:\workplace\daily_work\pdf2json\output\签字模板二'
			
 
				+            ret = pool.submit(pdf_ocr, file_path, output_dir, table_type='v2')
			
 
				+            ret.add_done_callback(print_callback)
			
 
				+    pool.shutdown()
			
 
				+
			
 
				+def print_callback(ret):
			
 
				+    # print('ret:', ret.result())
			
 
				+    pass
			
 
				+
			
 
if __name__ == '__main__':
    # The three helpers below are alternative manual test entry points.
    # test_dir()
    # test_thread()
    # test_single()
    # Manual smoke test: render a local PDF to images through DPFParser.
    pdf_obj = DPFParser()
    with open(r"F:\code\easyofd\test\test.pdf","rb") as f:
        pdf_bytes = f.read()

    img_list = pdf_obj.to_img(pdf_bytes)
    pil_img_list = []
    for _img in img_list:
        print(_img.width,_img.height)
        # Build a PIL image from the raw RGB samples of the rendered page.
        img = Image.frombytes("RGB", [_img.width, _img.height], _img.samples)
        print(type(img))
        # Every page is written to the same file name, so only the last one remains.
        img.save('output_image.png')
			
 
				+      
			
 
				+    
			
--- a/format_convert/easyofd/easyofd/draw/simsun.ttc
+++ b/format_convert/easyofd/easyofd/draw/simsun.ttc
--- a/format_convert/easyofd/easyofd/ofd.py
+++ b/format_convert/easyofd/easyofd/ofd.py
@@ -0,0 +1,301 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME: F:\code\easyofd\easyofd
			
 
				+# CREATE_TIME: 2023-10-07
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# note:  ofd 基础类
			
 
				+import base64
			
 
				+import os
			
 
				+import sys
			
 
				+from io import BytesIO
			
 
				+from typing import Union
			
 
				+
			
 
				+# sys.path.insert(0, os.getcwd())
			
 
				+# sys.path.insert(0, "..")
			
 
				+
			
 
				+import fitz
			
 
				+from PIL import Image
			
 
				+from fontTools.ttLib import TTFont
			
 
				+from loguru import logger
			
 
				+
			
 
				+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../")
			
 
				+
			
 
				+from format_convert.easyofd.easyofd.parser_ofd import OFDParser
			
 
				+from format_convert.easyofd.easyofd.draw import DrawPDF, OFDWrite
			
 
				+
			
 
				+
			
 
				+class OFD(object):
			
 
				+    """ofd对象"""
			
 
				+
			
 
				+    def __init__(self, ):
			
 
				+        self.data = None
			
 
				+
			
 
				+    def read(self, ofd_f: Union[str, bytes, BytesIO], fmt="b64", save_xml=False, xml_name="testxml", save_dir=None):
			
 
				+        """_summary_
			
 
				+        Args:
			
 
				+            file (_type_): _description_
			
 
				+            fomat (str, optional): _description_. Defaults to "path".
			
 
				+            fomat in ("path","b64","binary")
			
 
				+        """
			
 
				+        if fmt == "path":
			
 
				+            with open(ofd_f, "rb") as f:
			
 
				+                ofd_f = str(base64.b64encode(f.read()), encoding="utf-8")
			
 
				+        elif fmt == "b64":
			
 
				+            pass
			
 
				+        elif fmt == "binary":
			
 
				+            ofd_f = str(base64.b64encode(ofd_f), encoding="utf-8")
			
 
				+        elif fmt == "io":
			
 
				+            ofd_f = str(base64.b64encode(ofd_f.getvalue()), encoding="utf-8")
			
 
				+        else:
			
 
				+            raise "fomat Error: %s" % fmt
			
 
				+
			
 
				+        self.data = OFDParser(ofd_f)(save_xml=save_xml, xml_name=xml_name, save_dir=save_dir)
			
 
				+
			
 
				+    def save(self, ):
			
 
				+        """
			
 
				+        draw ofd xml
			
 
				+        初始化一个xml 文件
			
 
				+        self.data > file
			
 
				+        """
			
 
				+        assert self.data, f"data is None"
			
 
				+
			
 
				+    def pdf2ofd(self, pdfbyte, optional_text=False):
			
 
				+        """pdf转ofd"""
			
 
				+        assert pdfbyte, f"pdfbyte is None"
			
 
				+        # logger.info(f"pdf2ofd")
			
 
				+        ofd_byte = OFDWrite()(pdfbyte, optional_text=optional_text)
			
 
				+        return ofd_byte
			
 
				+
			
 
				+    def to_pdf(self, return_need_convert_as_image=False):
			
 
				+        """return ofdbytes"""
			
 
				+
			
 
				+        assert self.data, f"data is None"
			
 
				+        # logger.info(f"to_pdf")
			
 
				+        obj = DrawPDF(self.data)
			
 
				+        result = obj()
			
 
				+        if not return_need_convert_as_image:
			
 
				+            return result
			
 
				+        else:
			
 
				+            return result, obj.page_need_to_image_dict
			
 
				+
			
 
				+    def pdf2img(self, pdfbytes):
			
 
				+
			
 
				+        image_list = []
			
 
				+
			
 
				+        doc = fitz.open(stream=pdfbytes, filetype="pdf")
			
 
				+
			
 
				+        for page in doc:
			
 
				+            rotate = int(0)
			
 
				+            zoom_x, zoom_y = 1.6, 1.6
			
 
				+            zoom_x, zoom_y = 2, 2
			
 
				+            mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
			
 
				+            pix = page.get_pixmap(matrix=mat, alpha=False)
			
 
				+            pil_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
			
 
				+            # image = np.ndarray((pix.height, pix.width, 3), dtype=np.uint8, buffer=pix.samples)
			
 
				+            # print(image.shape)
			
 
				+            # print(image[2])
			
 
				+            image_list.append(pil_image)
			
 
				+        # logger.info(f"pdf2img")
			
 
				+        return image_list
			
 
				+
			
 
				+    def jpg2ofd(self, imglist: list):
			
 
				+        """
			
 
				+        imglist: pil image list
			
 
				+        """
			
 
				+        ofd_byte = OFDWrite()(pil_img_list=imglist)
			
 
				+        return ofd_byte
			
 
				+
			
 
				+    def jpg2pfd(self, imglist: list):
			
 
				+        """
			
 
				+        imglist: PIL image list
			
 
				+        1 构建data 
			
 
				+        2 DrawPDF(self.data)()
			
 
				+        """
			
 
				+
			
 
				+        data = OFDParser(None).img2data(imglist)
			
 
				+        return DrawPDF(data)()
			
 
				+
			
 
				+    def to_jpg(self, format="jpg"):
			
 
				+        """
			
 
				+        return pil list
			
 
				+        """
			
 
				+        assert self.data, f"data is None"
			
 
				+        image_list = []
			
 
				+        pdfbytes = self.to_pdf()
			
 
				+        image_list = self.pdf2img(pdfbytes)
			
 
				+        return image_list
			
 
				+
			
 
				+    def del_data(self, ):
			
 
				+        """销毁self.data"""
			
 
				+        self.data = None
			
 
				+
			
 
				+    def __del__(self):
			
 
				+        del self
			
 
				+
			
 
				+    def disposal(self, ):
			
 
				+        """销毁对象"""
			
 
				+        self.__del__()
			
 
				+
			
 
				+
			
 
				+def find_similar_characters():
			
 
				+    similar_pairs = []
			
 
				+    for code in range(0x4E00, 0x9FFF):  # 遍历常见的中文字符范围
			
 
				+        char = chr(code)
			
 
				+        try:
			
 
				+            name = unicodedata.name(char)
			
 
				+            if name.startswith('CJK COMPATIBILITY IDEOGRAPH'):
			
 
				+                original_char = unicodedata.lookup(name.split()[-1])
			
 
				+                similar_pairs.append((original_char, char))
			
 
				+        except (ValueError, KeyError):
			
 
				+            continue
			
 
				+    return similar_pairs
			
 
				+
			
 
				+
			
 
				+def save_chinese_characters(output_path):
			
 
				+    with open(output_path, 'w', encoding='utf-8') as file:
			
 
				+        # 遍历更多的中文字符范围
			
 
				+        # for code in range(0x3400, 0x4DFF + 1):  # CJK Unified Ideographs Extension A
			
 
				+        #     char = chr(code)
			
 
				+        #     # if not unicodedata.category(char).startswith('P'):
			
 
				+        #     file.write(char + '\n')
			
 
				+        # for code in range(0x4E00, 0x9FFF + 1):  # 常见的中文字符范围
			
 
				+        #     char = chr(code)
			
 
				+        #     # if not unicodedata.category(char).startswith('P'):
			
 
				+        #     file.write(char + '\n')
			
 
				+        # for code in range(0xF900, 0xFAFF + 1):  # CJK Compatibility Ideographs
			
 
				+        #     char = chr(code)
			
 
				+        #     # if not unicodedata.category(char).startswith('P'):
			
 
				+        #     file.write(char + '\n')
			
 
				+        # for code in range(0x2F00, 0x2FDF + 1):  # CJK Compatibility Ideographs
			
 
				+        #     char = chr(code)
			
 
				+        #     # if not unicodedata.category(char).startswith('P'):
			
 
				+        #     file.write(char + '\n')
			
 
				+
			
 
				+        for code in range(0xF900, 0xFAD9 + 1):  # CJK Compatibility Ideographs
			
 
				+            char = chr(code)
			
 
				+            # if not unicodedata.category(char).startswith('P'):
			
 
				+            file.write(char + '\n')
			
 
				+
			
 
				+
			
 
				+def map_kangxi_to_common_characters(kangxi_start=0x2F00, kangxi_end=0x2FDF, common_start=0x4E00, common_end=0x9FFF, output_path="kangxi_to_common.txt"):
			
 
				+    with open(output_path, 'w', encoding='utf-8') as file:
			
 
				+        # 遍历康熙部首范围
			
 
				+        for kangxi_code in range(kangxi_start, kangxi_end + 1):
			
 
				+            kangxi_char = chr(kangxi_code)
			
 
				+            # 遍历常见中文字符范围
			
 
				+            for common_code in range(common_start, common_end + 1):
			
 
				+                common_char = chr(common_code)
			
 
				+                # 如果字形相同，则记录匹配
			
 
				+                if kangxi_char == common_char:
			
 
				+                    file.write(f"{kangxi_char} (Kangxi: {hex(kangxi_code)}) -> {common_char} (Common: {hex(common_code)})\n")
			
 
				+                    break  # 找到匹配后，跳出内层循环
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # ofd = OFD()
			
 
				+
			
 
				+    # p = r'D:\Project\format_conversion_maxcompute\format_convert\temp\2b42e0b44cea11f0ab9644f971944973\2b4307ae4cea11f0992a44f971944973_ofd\Doc_0\Res\19.ttf'
			
 
				+    # font = TTFont(p)  # 替换为你的TTF文件路径
			
 
				+    # print('font', font.keys())
			
 
				+    #
			
 
				+    # # 访问 GlyphOrder 表
			
 
				+    # glyph_order = font['glyf']
			
 
				+    # print("Glyph Order:", glyph_order.glyphs)
			
 
				+    #
			
 
				+    # # 访问 head 表
			
 
				+    # head = font['head']
			
 
				+    # print("Font Head:")
			
 
				+    # print(f" - Font Magic Number: {head.magicNumber}")
			
 
				+    # print(f" - Font Version: {head.fontRevision}")
			
 
				+    # print(f" - Font Flags: {head.flags}")
			
 
				+    # print(f" - Units per Em: {head.unitsPerEm}")
			
 
				+    # print(f" - Created: {head.created}")
			
 
				+    # print(f" - Modified: {head.modified}")
			
 
				+    #
			
 
				+    # # 访问 hhea 表
			
 
				+    # hhea = font['hhea']
			
 
				+    # print("Horizontal Header:")
			
 
				+    # print(f" - Ascent: {hhea.ascent}")
			
 
				+    # print(f" - Descent: {hhea.descent}")
			
 
				+    # print(f" - Line Gap: {hhea.lineGap}")
			
 
				+    #
			
 
				+    # # 访问 maxp 表
			
 
				+    # maxp = font['maxp']
			
 
				+    # print("Maximum Profile:")
			
 
				+    # print(f" - Number of Glyphs: {maxp.numGlyphs}")
			
 
				+    #
			
 
				+    # # 访问 OS/2 表
			
 
				+    # os2 = font['OS/2']
			
 
				+    # print("OS/2 and Windows Metrics:")
			
 
				+    # print(f" - Weight Class: {os2.usWeightClass}")
			
 
				+    # print(f" - Width Class: {os2.usWidthClass}")
			
 
				+    # print(f" - Type: {os2.fsType}")
			
 
				+    #
			
 
				+    # # 访问 hmtx 表
			
 
				+    # hmtx = font['hmtx']
			
 
				+    # print("Horizontal Metrics:")
			
 
				+    # for glyph_name, metrics in hmtx.metrics.items():
			
 
				+    #     print(f" - Glyph '{glyph_name}': Advance Width = {metrics[0]}, Left Side Bearing = {metrics[1]}")
			
 
				+    #
			
 
				+    # # 访问 loca 表
			
 
				+    # loca = font.get('loca')
			
 
				+    # print("Locations:")
			
 
				+    # for i, location in enumerate(loca):
			
 
				+    #     print(f" - Glyph {i}: {location}")
			
 
				+    #
			
 
				+    # # 访问 glyf 表
			
 
				+    # glyf = font.get('glyf')
			
 
				+    # for glyph_name in glyf.glyphs:
			
 
				+    #     glyph = glyf[glyph_name]
			
 
				+    #     print(f"Glyph '{glyph_name}':")
			
 
				+    #     print(f" - Number of Contours: {glyph.numberOfContours}")
			
 
				+    #     if glyph.numberOfContours > 0:
			
 
				+    #         print(f" - X Minimum: {glyph.xMin}")
			
 
				+    #         print(f" - Y Minimum: {glyph.yMin}")
			
 
				+    #         print(f" - X Maximum: {glyph.xMax}")
			
 
				+    #         print(f" - Y Maximum: {glyph.yMax}")
			
 
				+    #     else:
			
 
				+    #         print(" - No Contours")
			
 
				+    #     print()
			
 
				+    #
			
 
				+    # # 访问 name 表
			
 
				+    # name = font['name']
			
 
				+    # print("Font Name Entries:")
			
 
				+    # for record in name.names:
			
 
				+    #     print(f" - Name ID: {record.nameID}")
			
 
				+    #     print(f" - Platform ID: {record.platformID}")
			
 
				+    #     print(f" - Encoding ID: {record.platEncID}")
			
 
				+    #     print(f" - Language ID: {record.langID}")
			
 
				+    #     print(f" - Name: {record.toUnicode()}")
			
 
				+    #     print()
			
 
				+    # font.close()
			
 
				+    # print(best_cmap)
			
 
				+
			
 
				+    import unicodedata
			
 
				+    #
			
 
				+    # # 示例
			
 
				+    # text = "仁和坪镇杨柳池村⼈居环境整治项⽬终⽌"
			
 
				+    # standardized_text = unicodedata.normalize('NFD', text)
			
 
				+    # print(f"标准化后的文本: {standardized_text}")
			
 
				+
			
 
				+    # import unicodedata
			
 
				+    #
			
 
				+    #
			
 
				+    #
			
 
				+    # similar_characters = find_similar_characters()
			
 
				+    #
			
 
				+    # for pair in similar_characters:
			
 
				+    #     print(f"原始字符: {pair[0]}, 兼容字符: {pair[1]}")
			
 
				+    #
			
 
				+    # print(f"共找到 {len(similar_characters)} 对相似中文字符。")
			
 
				+
			
 
				+
			
 
				+    # 使用示例
			
 
				+    output_path = 'chinese_characters.txt'
			
 
				+    # save_chinese_characters(output_path)
			
 
				+
			
 
				+    # 获取并打印 Unicode 编码
			
 
				+    # char = '⽬'
			
 
				+    # # char = '目'
			
 
				+    # print(f"字符 '{char}' 的 Unicode 编码是: {ord(char):04X}")
			
--- a/format_convert/easyofd/easyofd/parser_ofd/__init__.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/__init__.py
@@ -0,0 +1,37 @@
 
				+import os
			
 
				+import sys
			
 
				+
			
 
				+from loguru import logger
			
 
				+from reportlab.pdfbase import pdfmetrics
			
 
				+from reportlab.pdfbase.cidfonts import UnicodeCIDFont
			
 
				+from reportlab.pdfbase.ttfonts import TTFont
			
 
				+
			
 
				+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../../")
			
 
				+
			
 
				+
			
 
				+# from ofd_parser import *
			
 
				+
			
 
				+
			
 
				+font_map = {"simsun.ttc":["宋体", "SWPMEH+SimSun","SimSun","SWDKON+SimSun"],
			
 
				+            'simkai.ttf':["KaiTi","楷体","SWLCQE+KaiTi","SWHGME+KaiTi","BWSimKai"],
			
 
				+            # 'STKAITI.TTF':["华文楷体 常规","STKAITI","华文楷体"],
			
 
				+            "COURI.TTF":["CourierNewPSMT","CourierNew","SWCRMF+CourierNewPSMT","SWANVV+CourierNewPSMT"],
			
 
				+            "courbd.TTF":["Courier New"],
			
 
				+            "simhei.ttf":["SimHei","hei","黑体"]
			
 
				+            }
			
 
				+pdfmetrics.registerFont(UnicodeCIDFont('STSong-Light'))
			
 
				+
			
 
				+# 初始化字体
			
 
				+for font,names in font_map.items():
			
 
				+    for name in names:
			
 
				+        try:
			
 
				+            pdfmetrics.registerFont(TTFont(name, font))
			
 
				+        except:
			
 
				+            logger.warning(f"FONT  registerFont failed {font}: {name}")
			
 
				+
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.ofd_parser import OFDParser
			
 
				+__all__=["OFDParser"]
			
 
				+                                    
			
 
				+
			
 
				+
			
 
				+
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_annotation_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_annotation_parser.py
@@ -0,0 +1,145 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  file_annotation_parser.py
			
 
				+# CREATE_TIME: 2025/3/28 14:12
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE: 注释解析
			
 
				+import re
			
 
				+
			
 
				+from loguru import logger
			
 
				+from .file_parser_base import FileParserBase
			
 
				+
			
 
				+
			
 
				+# class AnnotationsParser(FileParserBase):
			
 
				+#     """
			
 
				+#     Parser Annotations
			
 
				+#     注释信息-总
			
 
				+#     /xml_dir/Doc_0/Pages/Page_0/Content.xml
			
 
				+#     """
			
 
				+#
			
 
				+#     def __call__(self):
			
 
				+#         info = {}
			
 
				+#         annotations_res: list = []
			
 
				+#         annotations_res_key = "ofd:Page"
			
 
				+#         self.recursion_ext(self.xml_obj, annotations_res, annotations_res_key)
			
 
				+#         # logger.debug(f"annotations_res is {annotations_res}")
			
 
				+#         if annotations_res:
			
 
				+#             for i in annotations_res:
			
 
				+#                 page_id = i.get("@PageID")
			
 
				+#                 if not page_id:
			
 
				+#                     # logger.debug(f"page_id is null ")
			
 
				+#                     continue
			
 
				+#                 file_Loc = i.get("ofd:FileLoc")
			
 
				+#                 if not file_Loc:
			
 
				+#                     # logger.debug(f"file_Loc is null ")
			
 
				+#                     continue
			
 
				+#                 info[page_id] = {
			
 
				+#                     "FileLoc": file_Loc,
			
 
				+#                 }
			
 
				+#
			
 
				+#         return info
			
 
				+#
			
 
				+#
			
 
				+# class AnnotationFileParser(FileParserBase):
			
 
				+#     """
			
 
				+#     Parser Annotation
			
 
				+#     注释类 包含 签名注释 水印注释 信息注释
			
 
				+#     """
			
 
				+#
			
 
				+#     AnnoType = {
			
 
				+#         "Watermark": {
			
 
				+#             "name": "水印",
			
 
				+#             "type": "Watermark"
			
 
				+#         },
			
 
				+#         "Link": {
			
 
				+#             "name": "链接",
			
 
				+#             "type": "Link"
			
 
				+#         }
			
 
				+#         ,
			
 
				+#         "Path": {
			
 
				+#             "name": "路径",
			
 
				+#             "type": "Path"
			
 
				+#         },
			
 
				+#         "Highlight": {
			
 
				+#             "name": "高亮",
			
 
				+#             "type": "Highlight"
			
 
				+#         },
			
 
				+#         "Stamp": {
			
 
				+#             "name": "签章",
			
 
				+#             "type": "Highlight"
			
 
				+#         }
			
 
				+#     }
			
 
				+#
			
 
				+#     def normalize_font_name(self, font_name):
			
 
				+#         """将字体名称规范化，例如 'Times New Roman Bold' -> 'TimesNewRoman-Bold'"""
			
 
				+#         # 替换空格为无，并将样式（Bold/Italic等）用连字符连接
			
 
				+#         if not isinstance(font_name, str):
			
 
				+#             return ""
			
 
				+#         normalized = font_name.replace(' ', '')
			
 
				+#         # 处理常见的样式后缀
			
 
				+#         for style in ['Bold', 'Italic', 'Regular', 'Light', 'Medium', ]:
			
 
				+#             if style in normalized:
			
 
				+#                 normalized = normalized.replace(style, f'-{style}')
			
 
				+#
			
 
				+#         # todo 特殊字体名规范 后续存在需要完善
			
 
				+#         if normalized == "TimesNewRoman":
			
 
				+#             normalized = normalized.replace("TimesNewRoman", "Times-Roman")
			
 
				+#         return normalized
			
 
				+#
			
 
				+#     def __call__(self):
			
 
				+#         info = {}
			
 
				+#         public_res: list = []
			
 
				+#         public_res_key = "ofd:Page"
			
 
				+#         self.recursion_ext(self.xml_obj, public_res, public_res_key)
			
 
				+#
			
 
				+#         if public_res:
			
 
				+#             for i in public_res:
			
 
				+#                 info[i.get("@ID")] = {
			
 
				+#                     "FontName": self.normalize_font_name(i.get("@FontName")),
			
 
				+#                     "FontNameORI": i.get("@FontName"),
			
 
				+#                     "FamilyName": self.normalize_font_name(i.get("@FamilyName")),
			
 
				+#                     "FamilyNameORI": i.get("@FamilyName"),
			
 
				+#                     "Bold": i.get("@Bold"),
			
 
				+#                     "Serif": i.get("@Serif"),
			
 
				+#                     "FixedWidth": i.get("@FixedWidth"),
			
 
				+#                     "FontFile": i.get("ofd:FontFile"),
			
 
				+#                 }
			
 
				+#         return info
			
 
				+
			
 
				+
			
 
				+class AnnotationFileParser(FileParserBase):
			
 
				+    """
			
 
				+    Annotations.xml 为doc内的根节点 包含：
			
 
				+    1 文件的路径
			
 
				+
			
 
				+    /xml_dir/Doc_0/Annotations.xml
			
 
				+    """
			
 
				+
			
 
				+    def loc2page_no(self, loc, idx):
			
 
				+        pg_no = re.search(r"\d+", loc)
			
 
				+        if pg_no:
			
 
				+            pg_no = int(pg_no.group())
			
 
				+        else:
			
 
				+            pg_no = idx
			
 
				+        return pg_no
			
 
				+
			
 
				+    def __call__(self):
			
 
				+        annot_info = {}
			
 
				+
			
 
				+        # ofd:Page 正文
			
 
				+        page: list = []
			
 
				+        page_id_map = {}
			
 
				+        page_key = "ofd:Page"
			
 
				+        self.recursion_ext(self.xml_obj, page, page_key)
			
 
				+        if page:
			
 
				+            # print('page', page)
			
 
				+            page_id_map = {
			
 
				+                i.get("@PageID"): self.loc2page_no(i.get("ofd:FileLoc"), idx)
			
 
				+                for idx, i in enumerate(page)
			
 
				+            }
			
 
				+            page = [i.get("ofd:FileLoc") if isinstance(i, dict) else i for i in page]
			
 
				+
			
 
				+        annot_info["annot_page"] = page
			
 
				+        annot_info["annot_page_id_map"] = page_id_map
			
 
				+        return annot_info
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_attachment_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_attachment_parser.py
@@ -0,0 +1,7 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  file_attachment_parser.py
			
 
				+# CREATE_TIME: 2025/4/9 18:52
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE:
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_content_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_content_parser.py
@@ -0,0 +1,140 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  file_content_parser.py
			
 
				+# CREATE_TIME: 2025/3/28 11:47
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE: 解析正文
			
 
				+from loguru import  logger
			
 
				+from .file_parser_base import FileParserBase
			
 
				+
			
 
				+
			
 
				+class ContentFileParser(FileParserBase):
			
 
				+    """
			
 
				+    Parser Contents&tpls
			
 
				+    /xml_dir/Doc_0/Doc_0/Pages/Page_0/Content.xml
			
 
				+    """
			
 
				+
			
 
				+    def fetch_cell_info(self, row, TextObject):
			
 
				+        """fetch_cell_info"""
			
 
				+        cell_d = {}
			
 
				+        cell_d = {}
			
 
				+        cell_d["ID"] = row['@ID']  # 字体
			
 
				+        # 字体字形信息
			
 
				+        if row.get("ofd:CGTransform"):
			
 
				+            Glyphs_d = {
			
 
				+                "Glyphs": row.get("ofd:CGTransform").get("ofd:Glyphs"),
			
 
				+                "GlyphCount": row.get("ofd:CGTransform").get("@GlyphCount"),
			
 
				+                "CodeCount": row.get("ofd:CGTransform").get("@CodeCount"),
			
 
				+                "CodePosition": row.get("ofd:CGTransform").get("@CodePosition")
			
 
				+            }
			
 
				+            cell_d["Glyphs_d"] = Glyphs_d
			
 
				+
			
 
				+        cell_d["pos"] = [float(pos_i) for pos_i in row['@Boundary'].split(" ")]  # 文本框
			
 
				+        if row.get('ofd:Clips', {}).get('ofd:Clip', {}).get('ofd:Area', {}).get('ofd:Path', {}):
			
 
				+            try:
			
 
				+                cell_d["clips_pos"] = [float(pos_i) for pos_i in
			
 
				+                                       row.get('ofd:Clips', {})
			
 
				+                                           .get('ofd:Clip', {})
			
 
				+                                           .get('ofd:Area', {})
			
 
				+                                           .get('ofd:Path', {})
			
 
				+                                           .get('@Boundary', "")
			
 
				+                                           .split(" ")]
			
 
				+            except:
			
 
				+                pass
			
 
				+        cell_d["text"] = str(TextObject.get('#text'))
			
 
				+        cell_d["font"] = row['@Font']  # 字体
			
 
				+        cell_d["size"] = float(row['@Size'])  # 字号
			
 
				+        # print("row", row)
			
 
				+
			
 
				+        color = self.ofd_param("ofd:FillColor", row).get("@Value", "0 0 0")
			
 
				+
			
 
				+        cell_d["color"] = tuple(color.split(" "))  # 颜色
			
 
				+        cell_d["DeltaY"] = TextObject.get("@DeltaY", "")  # y 轴偏移量 竖版文字表示方法之一
			
 
				+        cell_d["DeltaX"] = TextObject.get("@DeltaX", "")  # x 轴偏移量
			
 
				+        cell_d["CTM"] = row.get("@CTM", "")  # 平移矩阵换
			
 
				+
			
 
				+        cell_d["X"] = TextObject.get("@X", "")  # X 文本之与文本框距离
			
 
				+        cell_d["Y"] = TextObject.get("@Y", "")  # Y 文本之与文本框距离
			
 
				+        return cell_d
			
 
				+
			
 
				+    def __call__(self) -> list:
			
 
				+        """
			
 
				+
			
 
				+        输出主体坐标和文字信息 cell_list
			
 
				+        [{"pos":row['@Boundary'].split(" "),
			
 
				+                    "text":row['ofd:TextCode'].get('#text'),
			
 
				+                    "font":row['@Font'],
			
 
				+                    "size":row['@Size'],}]
			
 
				+        """
			
 
				+        text_list = []
			
 
				+        img_list = []
			
 
				+        line_list = []
			
 
				+
			
 
				+        content_d = {
			
 
				+            "text_list": text_list,
			
 
				+            "img_list": img_list,
			
 
				+            "line_list": line_list,
			
 
				+        }
			
 
				+
			
 
				+        text: list = []  # 正文
			
 
				+        text_key = "ofd:TextObject"
			
 
				+        self.recursion_ext(self.xml_obj, text, text_key)
			
 
				+
			
 
				+        if text:
			
 
				+            for row in text:
			
 
				+                # print("row", row.get('ofd:TextCode', {}))
			
 
				+                if isinstance(row.get('ofd:TextCode', {}), list):
			
 
				+                    for _i in row.get('ofd:TextCode', {}):
			
 
				+                        if not _i.get('#text'):
			
 
				+                            continue
			
 
				+                        cell_d = self.fetch_cell_info(row, _i)
			
 
				+                        text_list.append(cell_d)
			
 
				+
			
 
				+                elif isinstance(row.get('ofd:TextCode', {}), dict):
			
 
				+                    if not row.get('ofd:TextCode', {}).get('#text'):
			
 
				+                        continue
			
 
				+                    cell_d = self.fetch_cell_info(row, row.get('ofd:TextCode', {}))
			
 
				+                    text_list.append(cell_d)
			
 
				+
			
 
				+                else:
			
 
				+                    logger.error(f"'ofd:TextCode' format nonsupport  {row.get('ofd:TextCode', {})}")
			
 
				+                    continue
			
 
				+
			
 
				+        line: list = []  # 路径线条
			
 
				+        line_key = "ofd:PathObject"
			
 
				+        self.recursion_ext(self.xml_obj, line, line_key)
			
 
				+
			
 
				+        if line:
			
 
				+            # print(line)
			
 
				+            for _i in line:
			
 
				+                line_d = {}
			
 
				+                # print("line",_i)
			
 
				+                try:
			
 
				+                    line_d["ID"] = _i.get("@ID", "")  # 图片id
			
 
				+                    line_d["pos"] = [float(pos_i) for pos_i in _i['@Boundary'].split(" ")]  # 平移矩阵换
			
 
				+                    line_d["LineWidth"] = _i.get("@LineWidth", "")  # 图片id
			
 
				+                    line_d["AbbreviatedData"] = _i.get("ofd:AbbreviatedData", "")  # 路径指令
			
 
				+                    line_d["FillColor"] = self.ofd_param("ofd:FillColor", _i).get('@Value', "0 0 0").split(" ")  # 颜色
			
 
				+                    line_d["StrokeColor"] = self.ofd_param("ofd:StrokeColor", _i).get('@Value', "0 0 0")  # 颜色
			
 
				+                    line_d["CTM"] = _i.get("@CTM", "")  # 平移矩阵换
			
 
				+                except KeyError as e:
			
 
				+                    logger.error(f"{e} \n line is {_i} \n")
			
 
				+                    continue
			
 
				+                line_list.append(line_d)
			
 
				+
			
 
				+        img: list = []  # 图片
			
 
				+        img_key = "ofd:ImageObject"
			
 
				+        self.recursion_ext(self.xml_obj, img, img_key)
			
 
				+
			
 
				+        if img:
			
 
				+            for _i in img:
			
 
				+                img_d = {}
			
 
				+                img_d["CTM"] = _i.get("@CTM", "")  # 平移矩阵换
			
 
				+                img_d["ID"] = _i.get("ID", "")  # 图片id
			
 
				+                img_d["ResourceID"] = _i.get("@ResourceID", "")  # 图片id
			
 
				+                img_d["pos"] = [float(pos_i) for pos_i in _i['@Boundary'].split(" ")]  # 平移矩阵换
			
 
				+                img_list.append(img_d)
			
 
				+
			
 
				+        return content_d
			
 
				+
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_customtag_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_customtag_parser.py
@@ -0,0 +1,7 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  file_customtag_parser.py
			
 
				+# CREATE_TIME: 2025/4/9 18:51
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE:
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_deal.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_deal.py
@@ -0,0 +1,104 @@
 
				+# coding: utf-8
			
 
				+#!/usr/bin/env python
			
 
				+#-*- coding: utf-8 -*-
			
 
				+#PROJECT_NAME: D:\code\easyofd\easyofd\parser
			
 
				+#CREATE_TIME: 2023-07-27 
			
 
				+#E_MAIL: renoyuan@foxmail.com
			
 
				+#AUTHOR: reno 
			
 
				+#NOTE:  文件处理
			
 
				+import os
			
 
				+import base64
			
 
				+import shutil
			
 
				+from typing import Any
			
 
				+from uuid import uuid1
			
 
				+
			
 
				+import xmltodict
			
 
				+import zipfile
			
 
				+from loguru import logger
			
 
				+
			
 
				+from .path_parser import PathParser
			
 
				+
			
 
				+
			
 
				+class FileRead(object):
			
 
				+    """
			
 
				+    文件读取，清除
			
 
				+    'root': OFD.xml 
			
 
				+    "root_doc" Doc_0/Document.xml
			
 
				+    xml_path : xml_obj
			
 
				+    other_path : b64string
			
 
				+    """
			
 
				+    def __init__(self, ofdb64:str):
			
 
				+
			
 
				+        self.ofdbyte = base64.b64decode(ofdb64) 
			
 
				+        pid=os.getpid()
			
 
				+        self.name = f"{pid}_{str(uuid1())}.ofd"
			
 
				+        self.pdf_name = self.name.replace(".ofd",".pdf")
			
 
				+        self.zip_path = f"{os.getcwd()}/{self.name}"
			
 
				+        self.unzip_path = ""
			
 
				+        self.file_tree = {}
			
 
				+    
			
 
				+    def unzip_file(self, unzip_dir=None):
			
 
				+        """
			
 
				+        :param zip_path: ofd格式文件路径
			
 
				+        :param unzip_path: 解压后的文件存放目录
			
 
				+        :return: unzip_path
			
 
				+        """
			
 
				+        if unzip_dir is None:
			
 
				+            self.unzip_path = self.zip_path.split('.')[0]
			
 
				+            self.zip_path = f"{os.getcwd()}/{self.name}"
			
 
				+        else:
			
 
				+            self.unzip_path = unzip_dir
			
 
				+            self.zip_path = f"{unzip_dir}{self.name}"
			
 
				+        print('ofd self.unzip_path', self.unzip_path)
			
 
				+        print('ofd self.zip_path', self.zip_path)
			
 
				+
			
 
				+        with open(self.zip_path,"wb") as f:
			
 
				+            f.write(self.ofdbyte)
			
 
				+
			
 
				+        with zipfile.ZipFile(self.zip_path, 'r') as f:
			
 
				+            for file in f.namelist():
			
 
				+                # print('file', file)
			
 
				+                # 跳过附件，在显示中不展示
			
 
				+                if 'Attachs' in file:
			
 
				+                    continue
			
 
				+                f.extract(file, path=self.unzip_path)
			
 
				+        if self.save_xml:
			
 
				+            print("saving xml {}".format(self.xml_name))
			
 
				+            with zipfile.ZipFile(self.zip_path, 'r') as f:
			
 
				+                for file in f.namelist():
			
 
				+                    f.extract(file, path=self.xml_name)
			
 
				+       
			
 
				+    def buld_file_tree(self):
			
 
				+        "xml读取对象其他b64"
			
 
				+        self.file_tree["root"] = self.unzip_path
			
 
				+        self.file_tree["pdf_name"] = self.pdf_name
			
 
				+        for root, dirs, files in os.walk(self.unzip_path):
			
 
				+            for file in files:
			
 
				+                
			
 
				+                abs_path = os.path.join(root,file)
			
 
				+                # 资源文件 则 b64 xml 则  xml——obj
			
 
				+                self.file_tree[abs_path] = str(base64.b64encode(open(f"{abs_path}","rb").read()),"utf-8")  \
			
 
				+                    if "xml" not in file else xmltodict.parse(open(f"{abs_path}" , "r", encoding="utf-8").read())
			
 
				+        self.file_tree["root_doc"] = os.path.join(self.unzip_path,"OFD.xml") if os.path.join(self.unzip_path,"OFD.xml") in self.file_tree else ""
			
 
				+  
			
 
				+        # if os.path.exists(self.unzip_path):
			
 
				+        #     shutil.rmtree(self.unzip_path)
			
 
				+       
			
 
				+        # if os.path.exists(self.zip_path):
			
 
				+        #     os.remove(self.zip_path)
			
 
				+                   
			
 
				+    def __call__(self, *args: Any, **kwds: Any) -> Any:
			
 
				+        self.save_xml=kwds.get("save_xml",False)
			
 
				+        self.xml_name=kwds.get("xml_name")
			
 
				+        self.save_dir = kwds.get('save_dir')
			
 
				+    
			
 
				+        self.unzip_file(self.save_dir)
			
 
				+        self.buld_file_tree()
			
 
				+        return self.file_tree 
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    with open(r"D:/code/easyofd/test/增值税电子专票5.ofd","rb") as f:
			
 
				+        ofdb64 = str(base64.b64encode(f.read()),"utf-8")
			
 
				+    a = FileRead(ofdb64)()
			
 
				+    print(list(a.keys()))
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_doc_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_doc_parser.py
@@ -0,0 +1,99 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  file_doc_parser.py
			
 
				+# CREATE_TIME: 2025/3/28 11:46
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE: 解析document
			
 
				+
			
 
				+import  re
			
 
				+
			
 
				+from .file_parser_base import FileParserBase
			
 
				+
			
 
				+
			
 
				+
			
 
				+class DocumentFileParser(FileParserBase):
			
 
				+    """
			
 
				+    Document 为doc内的根节点 包含：
			
 
				+    1 文件的路径 2 doc的size
			
 
				+
			
 
				+    /xml_dir/Doc_0/Document.xml
			
 
				+    """
			
 
				+
			
 
				+    def loc2page_no(self, loc, idx):
			
 
				+        pg_no = re.search(r"\d+", loc)
			
 
				+        if pg_no:
			
 
				+            pg_no = int(pg_no.group())
			
 
				+        else:
			
 
				+            pg_no = idx
			
 
				+        return pg_no
			
 
				+
			
 
				+    def __call__(self):
			
 
				+        document_info = {}
			
 
				+
			
 
				+        # size
			
 
				+        physical_box: list = []
			
 
				+        physical_box_key = "ofd:PhysicalBox"
			
 
				+        self.recursion_ext(self.xml_obj, physical_box, physical_box_key)
			
 
				+        document_info["size"] = physical_box[0] if physical_box else ""
			
 
				+
			
 
				+        # ofd:PublicRes路径 包含字体路径信息
			
 
				+        public_res: list = []
			
 
				+        public_res_key = "ofd:PublicRes"
			
 
				+        self.recursion_ext(self.xml_obj, public_res, public_res_key)
			
 
				+        document_info["public_res"] = public_res
			
 
				+
			
 
				+        # ofd:DocumentRes路径  包含静态资源图片
			
 
				+        document_res: list = []
			
 
				+        document_res_key = "ofd:DocumentRes"
			
 
				+        self.recursion_ext(self.xml_obj, document_res, document_res_key)
			
 
				+        document_info["document_res"] = document_res
			
 
				+
			
 
				+        # tpls
			
 
				+        tpls: list = []
			
 
				+        template_page_key = "ofd:TemplatePage"
			
 
				+        self.recursion_ext(self.xml_obj, tpls, template_page_key)
			
 
				+        if tpls:
			
 
				+            tpls = [i.get("@BaseLoc") if isinstance(i, dict) else i for i in tpls]
			
 
				+        document_info["tpls"] = tpls
			
 
				+
			
 
				+        # ofd:Page 正文
			
 
				+        page: list = []
			
 
				+        page_id_map = {}
			
 
				+        page_key = "ofd:Page"
			
 
				+        self.recursion_ext(self.xml_obj, page, page_key)
			
 
				+        if page:
			
 
				+            page_id_map = {
			
 
				+                i.get("@ID"): self.loc2page_no(i.get("@BaseLoc"), idx)
			
 
				+                for idx, i in enumerate(page)
			
 
				+            }
			
 
				+            page = [i.get("@BaseLoc") if isinstance(i, dict) else i for i in page]
			
 
				+
			
 
				+        document_info["page"] = page
			
 
				+        document_info["page_id_map"] = page_id_map
			
 
				+
			
 
				+        # ofd:Annotations
			
 
				+        annotations: list = []
			
 
				+        annotations_key = "ofd:Annotations"
			
 
				+        self.recursion_ext(self.xml_obj, annotations, annotations_key)
			
 
				+        document_info["Annotations"] = annotations
			
 
				+
			
 
				+        # ofd:Attachments
			
 
				+        attachments: list = []
			
 
				+        attachments_key = "ofd:Attachments"
			
 
				+        self.recursion_ext(self.xml_obj, attachments, attachments_key)
			
 
				+        document_info["attachments"] = attachments
			
 
				+
			
 
				+        # ofd:CustomTags
			
 
				+        custom_tag: list = []
			
 
				+        custom_tag_key = "ofd:CustomTags"
			
 
				+        self.recursion_ext(self.xml_obj, custom_tag, custom_tag_key)
			
 
				+        document_info["custom_tag"] = custom_tag
			
 
				+
			
 
				+        return document_info
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_docres_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_docres_parser.py
@@ -0,0 +1,36 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  file_docres_parser.py
			
 
				+# CREATE_TIME: 2025/3/28 11:48
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE: 解析 DocumentRes
			
 
				+
			
 
				+import os
			
 
				+
			
 
				+from .file_parser_base import FileParserBase
			
 
				+
			
 
				+class DocumentResFileParser(FileParserBase):
			
 
				+    """
			
 
				+    Parser DocumentRes 抽取里面图片信息
			
 
				+    /xml_dir/Doc_0/DocumentRes.xml
			
 
				+    /xml_dir/Doc_0/PublicRes.xml
			
 
				+    """
			
 
				+
			
 
				+    def __call__(self):
			
 
				+        info = {}
			
 
				+        muti_media: list = []
			
 
				+        muti_media_key = "ofd:MultiMedia"
			
 
				+        self.recursion_ext(self.xml_obj, muti_media, muti_media_key)
			
 
				+        if muti_media:
			
 
				+            for media in muti_media:
			
 
				+                name = media.get("ofd:MediaFile", "")
			
 
				+                info[media.get("@ID")] = {
			
 
				+                    "format": media.get("@Format", ""),
			
 
				+                    "wrap_pos": media.get("@wrap_pos", ""),
			
 
				+                    # "Boundary": media.get("@Boundary", ""),
			
 
				+                    "type": media.get("@Type", ""),
			
 
				+                    "suffix": os.path.splitext(name)[-1].replace(".", ""),  # 文件后缀名
			
 
				+                    "fileName": name,
			
 
				+                }
			
 
				+        return info
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_ofd_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_ofd_parser.py
@@ -0,0 +1,41 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  file_ofd_parser.py
			
 
				+# CREATE_TIME: 2025/3/28 11:45
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE: 解析OFD
			
 
				+from .file_parser_base import FileParserBase
			
 
				+
			
 
				+class OFDFileParser(FileParserBase):
			
 
				+    """
			
 
				+    Parser OFD 文件
			
 
				+    /xml_dir/OFD.xml
			
 
				+    """
			
 
				+    def __call__(self):
			
 
				+        info = {}
			
 
				+        # DocRoot
			
 
				+        doc_root: list = []
			
 
				+        doc_root_key = "ofd:DocRoot"
			
 
				+        # print(self.xml_obj,doc_root)
			
 
				+        self.recursion_ext(self.xml_obj, doc_root, doc_root_key)
			
 
				+        info["doc_root"] = doc_root
			
 
				+
			
 
				+        signatures: list = []
			
 
				+        signatures_key = "ofd:Signatures"
			
 
				+        self.recursion_ext(self.xml_obj, signatures, signatures_key)
			
 
				+        info["signatures"] = signatures
			
 
				+
			
 
				+        # ofd:Creator
			
 
				+        creator: list = []
			
 
				+        creator_key = "ofd:Creator"
			
 
				+        self.recursion_ext(self.xml_obj, creator, creator_key)
			
 
				+        info["creator"] = creator
			
 
				+
			
 
				+        # ofd:CreationDate
			
 
				+        reation_date: list = []
			
 
				+        creation_date_key = "ofd:CreationDate"
			
 
				+        self.recursion_ext(self.xml_obj, reation_date, creation_date_key)
			
 
				+        info["creationDate"] = reation_date
			
 
				+
			
 
				+        return info
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_parser.py
@@ -0,0 +1,58 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME: D:\code\easyofd\easyofd\parser
			
 
				+# CREATE_TIME: 2023-07-27
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE: 每种类型的文件定义一个解析器
			
 
				+
			
 
				+import sys
			
 
				+
			
 
				+sys.path.insert(0, "..")
			
 
				+import logging
			
 
				+import os
			
 
				+import traceback
			
 
				+import base64
			
 
				+import re
			
 
				+from typing import Any
			
 
				+from .parameter_parser import ParameterParser
			
 
				+logger = logging.getLogger("root")
			
 
				+
			
 
				+
			
 
				+class FileParserBase(object):
			
 
				+    """xml解析"""
			
 
				+
			
 
				+    def __init__(self, xml_obj):
			
 
				+        assert xml_obj
			
 
				+        self.ofd_param = ParameterParser()
			
 
				+        self.xml_obj = xml_obj
			
 
				+        # print(xml_obj)
			
 
				+
			
 
				+    def recursion_ext(self, need_ext_obj, ext_list, key):
			
 
				+        """
			
 
				+        抽取需要xml要素
			
 
				+        need_ext_obj : xmltree
			
 
				+        ext_list: data container
			
 
				+        key: key
			
 
				+        """
			
 
				+        if isinstance(need_ext_obj, dict):
			
 
				+            for k, v in need_ext_obj.items():
			
 
				+                if k == key:
			
 
				+                    if isinstance(v, (dict, str)):
			
 
				+                        ext_list.append(v)
			
 
				+                    elif isinstance(v, list):
			
 
				+                        ext_list.extend(v)
			
 
				+                else:
			
 
				+                    if isinstance(v, dict):
			
 
				+                        self.recursion_ext(v, ext_list, key)
			
 
				+                    elif isinstance(v, list):
			
 
				+                        for cell in v:
			
 
				+                            self.recursion_ext(cell, ext_list, key)
			
 
				+                    else:
			
 
				+                        pass
			
 
				+        else:
			
 
				+            print(type(need_ext_obj))
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # NOTE(review): the constructor asserts a truthy xml_obj, so passing ""
				+    # raises AssertionError before the instance is ever called.
				+    FileParserBase("")()
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_parser_base.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_parser_base.py
@@ -0,0 +1,63 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  file_parser_base.py
			
 
				+# CREATE_TIME: 2025/3/28 11:43
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE: base 解析器
			
 
				+
			
 
				+import sys
			
 
				+
			
 
				+sys.path.insert(0, "..")
			
 
				+import logging
			
 
				+import os
			
 
				+import traceback
			
 
				+import base64
			
 
				+import re
			
 
				+from typing import Any
			
 
				+from .parameter_parser import ParameterParser
			
 
				+logger = logging.getLogger("root")
			
 
				+
			
 
				+
			
 
				+class FileParserBase(object):
			
 
				+    """xml解析"""
			
 
				+
			
 
				+    def __init__(self, xml_obj):
			
 
				+        assert xml_obj
			
 
				+        self.ofd_param = ParameterParser()
			
 
				+        self.xml_obj = xml_obj
			
 
				+        # print(xml_obj)
			
 
				+
			
 
				+    def recursion_ext(self, need_ext_obj, ext_list, key):
			
 
				+        """
			
 
				+        抽取需要xml要素
			
 
				+        need_ext_obj : xmltree
			
 
				+        ext_list: data container
			
 
				+        key: key
			
 
				+        """
			
 
				+
			
 
				+        if isinstance(need_ext_obj, dict):
			
 
				+
			
 
				+            for k, v in need_ext_obj.items():
			
 
				+                if k == key:
			
 
				+
			
 
				+                    if isinstance(v, (dict, str)):
			
 
				+                        ext_list.append(v)
			
 
				+                    elif isinstance(v, list):
			
 
				+                        ext_list.extend(v)
			
 
				+
			
 
				+
			
 
				+                else:
			
 
				+
			
 
				+                    if isinstance(v, dict):
			
 
				+                        self.recursion_ext(v, ext_list, key)
			
 
				+                    elif isinstance(v, list):
			
 
				+                        for cell in v:
			
 
				+                            self.recursion_ext(cell, ext_list, key)
			
 
				+                    else:
			
 
				+
			
 
				+                        pass
			
 
				+        else:
			
 
				+
			
 
				+            print(type(need_ext_obj))
			
 
				+
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_publicres_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_publicres_parser.py
@@ -0,0 +1,52 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  file_publicres_parser.py
			
 
				+# CREATE_TIME: 2025/3/28 11:49
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE: PublicResFileParser
			
 
				+
			
 
				+from .file_parser_base import FileParserBase
			
 
				+
			
 
				+
			
 
				+class PublicResFileParser(FileParserBase):
			
 
				+    """
			
 
				+    Parser PublicRes 抽取里面 获取公共信息 字体信息
			
 
				+    /xml_dir/Doc_0/PublicRes.xml
			
 
				+    """
			
 
				+
			
 
				+    def normalize_font_name(self, font_name):
			
 
				+        """将字体名称规范化，例如 'Times New Roman Bold' -> 'TimesNewRoman-Bold'"""
			
 
				+        # 替换空格为无，并将样式（Bold/Italic等）用连字符连接
			
 
				+        if not isinstance(font_name, str):
			
 
				+            return ""
			
 
				+        normalized = font_name.replace(' ', '')
			
 
				+        # 处理常见的样式后缀
			
 
				+        for style in ['Bold', 'Italic', 'Regular', 'Light', 'Medium', ]:
			
 
				+            if style in normalized:
			
 
				+                normalized = normalized.replace(style, f'-{style}')
			
 
				+
			
 
				+        # todo 特殊字体名规范 后续存在需要完善
			
 
				+        if normalized == "TimesNewRoman":
			
 
				+            normalized = normalized.replace("TimesNewRoman", "Times-Roman")
			
 
				+        return normalized
			
 
				+
			
 
				+    def __call__(self):
			
 
				+        info = {}
			
 
				+        public_res: list = []
			
 
				+        public_res_key = "ofd:Font"
			
 
				+        self.recursion_ext(self.xml_obj, public_res, public_res_key)
			
 
				+
			
 
				+        if public_res:
			
 
				+            for i in public_res:
			
 
				+                info[i.get("@ID")] = {
			
 
				+                    "FontName": self.normalize_font_name(i.get("@FontName")),
			
 
				+                    "FontNameORI": i.get("@FontName"),
			
 
				+                    "FamilyName": self.normalize_font_name(i.get("@FamilyName")),
			
 
				+                    "FamilyNameORI": i.get("@FamilyName"),
			
 
				+                    "Bold": i.get("@Bold"),
			
 
				+                    "Serif": i.get("@Serif"),
			
 
				+                    "FixedWidth": i.get("@FixedWidth"),
			
 
				+                    "FontFile": i.get("ofd:FontFile"),
			
 
				+                }
			
 
				+        return info
			
--- a/format_convert/easyofd/easyofd/parser_ofd/file_signature_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/file_signature_parser.py
@@ -0,0 +1,63 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  file_signature_parser.py
			
 
				+# CREATE_TIME: 2025/3/28 14:13
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE: 签章解析
			
 
				+
			
 
				+from .file_parser_base import FileParserBase
			
 
				+
			
 
				+class SignaturesFileParser(FileParserBase):
			
 
				+    """
			
 
				+    Parser Signatures
			
 
				+    签章信息-总
			
 
				+    /xml_dir/Doc_0/PublicRes.xml
			
 
				+    """
			
 
				+
			
 
				+    def __call__(self):
			
 
				+        info = {}
			
 
				+        signature_res: list = []
			
 
				+        signature_res_key = "ofd:Signature"
			
 
				+        self.recursion_ext(self.xml_obj, signature_res, signature_res_key)
			
 
				+
			
 
				+        if signature_res:
			
 
				+            for i in signature_res:
			
 
				+                info[i.get("@ID")] = {
			
 
				+                    "BaseLoc": i.get("@BaseLoc"),
			
 
				+                    "Type": i.get("@Type"),
			
 
				+                    "ID": i.get("@ID"),
			
 
				+
			
 
				+                }
			
 
				+        return info
			
 
				+
			
 
				+
			
 
				+class SignatureFileParser(FileParserBase):
			
 
				+    """
			
 
				+    Parser Signature
			
 
				+    签章信息
			
 
				+    """
			
 
				+
			
 
				+    def __call__(self, prefix=""):
			
 
				+        info = {}
			
 
				+        StampAnnot_res: list = []
			
 
				+        StampAnnot_res_key = "ofd:StampAnnot"
			
 
				+
			
 
				+        self.recursion_ext(self.xml_obj, StampAnnot_res, StampAnnot_res_key)
			
 
				+
			
 
				+        SignedValue_res: list = []
			
 
				+        SignedValue_res_key = "ofd:SignedValue"
			
 
				+        self.recursion_ext(self.xml_obj, SignedValue_res, SignedValue_res_key)
			
 
				+
			
 
				+        # print("SignedValue_res", SignedValue_res)
			
 
				+        # print("prefix", prefix)
			
 
				+        if StampAnnot_res:
			
 
				+            for i in StampAnnot_res:
			
 
				+                info = {
			
 
				+                    "PageRef": i.get("@PageRef"),  # page id
			
 
				+                    "Boundary": i.get("@Boundary"),
			
 
				+                    "ID": i.get("@ID"),
			
 
				+                    "SignedValue": f"{prefix}/{SignedValue_res[0]}" if SignedValue_res else f"{prefix}/SignedValue.dat",
			
 
				+                }
			
 
				+
			
 
				+        return info
			
--- a/format_convert/easyofd/easyofd/parser_ofd/find_seal_img.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/find_seal_img.py
@@ -0,0 +1,100 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME: easyofd read_seal_img
			
 
				+# CREATE_TIME: 2024/5/28 14:13
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: renoyuan
			
 
				+# note: 根据 ASN.1 解析签章 拿到 签章图片
			
 
				+import io
			
 
				+
			
 
				+from PIL import Image, UnidentifiedImageError
			
 
				+from loguru import logger
			
 
				+from pyasn1.codec.der.decoder import decode
			
 
				+from pyasn1.type import univ
			
 
				+from pyasn1.error import PyAsn1Error
			
 
				+
			
 
				+
			
 
				+
			
 
				+class SealExtract(object):
			
 
				+    def __init__(self,):
			
 
				+        pass
			
 
				+    def read_signed_value(self, path):
			
 
				+        # 读取二进制文件
			
 
				+        with open(path, 'rb') as file:
			
 
				+            binary_data = file.read()
			
 
				+        # 尝试解码为通用的 ASN.1 结构
			
 
				+        try:
			
 
				+            decoded_data, _ = decode(binary_data)
			
 
				+        except PyAsn1Error as e:
			
 
				+            # print(f"Decoding failed: {e}")
			
 
				+            decoded_data = None
			
 
				+        finally:
			
 
				+           return  decoded_data
			
 
				+
			
 
				+
			
 
				+    def find_octet_strings(self, asn1_data,octet_strings:list):
			
 
				+
			
 
				+        # 递归查找所有的 OctetString 实例
			
 
				+
			
 
				+        if isinstance(asn1_data, univ.OctetString):
			
 
				+
			
 
				+            octet_strings.append(asn1_data)
			
 
				+        elif isinstance(asn1_data, univ.Sequence) or isinstance(asn1_data, univ.Set):
			
 
				+            for component in asn1_data:
			
 
				+                self.find_octet_strings(asn1_data[f"{component}"], octet_strings)
			
 
				+        elif isinstance(asn1_data, univ.Choice):
			
 
				+            self.find_octet_strings(asn1_data.getComponent(), octet_strings)
			
 
				+        elif isinstance(asn1_data, univ.Any):
			
 
				+            try:
			
 
				+                sub_data, _ = decode(asn1_data.asOctets())
			
 
				+                self.find_octet_strings(sub_data, octet_strings)
			
 
				+            except PyAsn1Error:
			
 
				+                pass
			
 
				+
			
 
				+
			
 
				+    def hex_to_image(self, hex_data, image_format='PNG',inx=0):
			
 
				+        """
			
 
				+        将16进制数据转换为图片并保存。
			
 
				+
			
 
				+        :param hex_data: 图片的16进制数据字符串
			
 
				+        :param image_format: 图片的格式，默认为'PNG'
			
 
				+        """
			
 
				+        # 将16进制数据转换为二进制数据
			
 
				+
			
 
				+        binary_data = bytes.fromhex(hex_data)
			
 
				+
			
 
				+        # 创建BytesIO对象以读取二进制数据
			
 
				+        image_stream = io.BytesIO(binary_data)
			
 
				+
			
 
				+        # 使用Pillow打开图像数据并保存
			
 
				+        try:
			
 
				+            image = Image.open(image_stream)
			
 
				+            # image.save(f'{inx}_image.{image_format}', format=image_format)
			
 
				+            # print(f"图片已保存为'image.{image_format}'")
			
 
				+            return image
			
 
				+        except UnidentifiedImageError:
			
 
				+            pass
			
 
				+            # logger.info("not img ")
			
 
				+
			
 
				+    def __call__(self, path):
			
 
				+        decoded_data = self.read_signed_value(path)
			
 
				+        octet_strings = []
			
 
				+        img_list = []  # 目前是只有一个的，若存在多个的话关联后面考虑
			
 
				+        if decoded_data:
			
 
				+            self.find_octet_strings(decoded_data, octet_strings)
			
 
				+
			
 
				+            for i, octet_string in enumerate(octet_strings):
			
 
				+
			
 
				+                if str(octet_string.prettyPrint()).startswith("0x"):
			
 
				+
			
 
				+                    img = self.hex_to_image(str(octet_string.prettyPrint())[2:],inx= i)
			
 
				+                    if img:
			
 
				+                        img_list.append(img)
			
 
				+        else:
			
 
				+            pass
			
 
				+            # logger.info("No valid ASN.1 data found.")
			
 
				+        return  img_list
			
 
				+
			
 
				+if __name__=="__main__":
			
 
				+    print(SealExtract()(r"F:\code\easyofd\test\1111_xml\Doc_0\Signs\Sign_0\SignedValue.dat" ))
			
 
				+
			
--- a/format_convert/easyofd/easyofd/parser_ofd/img_deal.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/img_deal.py
@@ -0,0 +1,35 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME: easyofd img_deal
			
 
				+# CREATE_TIME: 2024/7/18 11:20
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: renoyuan
			
 
				+# note: img 操作
			
 
				+from io import BytesIO
			
 
				+class DealImg(object):
			
 
				+    def __init__(self):
			
 
				+        pass
			
 
				+    def resize(self):
			
 
				+        """resize img"""
			
 
				+        pass
			
 
				+    def pil2bytes(self, image):
			
 
				+        """pil2bytes"""
			
 
				+        # 创建一个 BytesIO 对象
			
 
				+        img_bytesio = BytesIO()
			
 
				+        # 将图像保存到 BytesIO 对象
			
 
				+        image.save(img_bytesio, format='PNG')  # 你可以根据需要选择其他图像格式
			
 
				+        # 获取 BytesIO 对象中的字节
			
 
				+        img_bytes = img_bytesio.getvalue()
			
 
				+        # 关闭 BytesIO 对象
			
 
				+        img_bytesio.close()
			
 
				+        return img_bytes
			
 
				+    def pil2bytes_io(self, image):
			
 
				+        """pil2bytes_io"""
			
 
				+        # 创建一个 BytesIO 对象
			
 
				+        img_bytesio = BytesIO()
			
 
				+        # 将图像保存到 BytesIO 对象
			
 
				+        image.save(img_bytesio, format='PNG')  # 你可以根据需要选择其他图像格式
			
 
				+        return img_bytesio
			
 
				+
			
 
				+
			
 
				+
			
--- a/format_convert/easyofd/easyofd/parser_ofd/ofd_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/ofd_parser.py
@@ -0,0 +1,607 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME: D:\code\easyofd\easyofd\parser
			
 
				+# CREATE_TIME: 2023-07-27
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE: ofd解析主流程
			
 
				+
			
 
				+import os
			
 
				+import sys
			
 
				+sys.path.append(os.path.dirname(__file__) + "/../../../../")
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.file_ofd_parser import OFDFileParser
			
 
				+from jbig2_parser import jbig2_parser
			
 
				+import traceback
			
 
				+import base64
			
 
				+import re
			
 
				+import io
			
 
				+# import jbigkit
			
 
				+from typing import Any, List
			
 
				+from PIL import Image
			
 
				+from PIL.Image import Image as ImageClass
			
 
				+from loguru import logger
			
 
				+
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.img_deal import DealImg
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.file_deal import FileRead
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.file_ofd_parser import OFDFileParser
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.file_doc_parser import DocumentFileParser
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.file_docres_parser import DocumentResFileParser
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.file_content_parser import ContentFileParser
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.file_annotation_parser import AnnotationFileParser
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.file_publicres_parser import PublicResFileParser
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.file_signature_parser import SignaturesFileParser,SignatureFileParser
			
 
				+from format_convert.easyofd.easyofd.parser_ofd.path_parser import PathParser
			
 
				+# todo 解析流程需要大改
			
 
				+
			
 
				+
			
 
				+class OFDParser(object):
			
 
				+    """
			
 
				+    OFDParser 解析
			
 
				+    1 解压文件 创建文件映射表 释放文件
			
 
				+    2 解析 xml 逐级去 收集需要信息  结构文本 以及 资源
			
 
				+    2 调用font 注册 字体
			
 
				+
			
 
				+    图层顺序 tlp>content>annotation
			
 
				+    """
			
 
				+
			
 
    def __init__(self, ofdb64):
        """Store the input and create the image helper.

        :param ofdb64: stored unchanged on the instance (presumably the
            base64 text of the OFD file; confirm against the caller)
        """
        # Input and parse state; file_tree is filled in later.
        self.ofdb64 = ofdb64
        self.file_tree = None
        self.img_deal = DealImg()
        # Location of the external jbig2dec executable used by jb22png_old.
        # self.jbig2dec_path = r"C:/msys64/mingw64/bin/jbig2dec.exe"
        self.jbig2dec_path = r'D:\Anaconda3\pkgs\jbig2dec-0.18-ha9979f8_0\Library\bin\jbig2dec.exe'
			
 
				+
			
 
    def img2data(self, imglist: List[ImageClass]):
        """Build the document data structure for a list of images.

        Every image becomes one page holding a single full-page image
        entry.  The result is a one-element list whose dict carries the
        image table, the page size of the last image, an empty font table
        and the per-page content.

        :param imglist: images; each needs ``size`` and must be accepted by
            ``self.img_deal.pil2bytes``
        :return: ``[document dict]``
        """
        OP = 200 / 25.4  # presumably pixels per mm at 200 dpi; confirm
        images = {}
        pages = {}
        page_size = []

        for idx, img_pil in enumerate(imglist):
            w, h = img_pil.size
            res_id = str(idx)
            encoded = self.img_deal.pil2bytes(img_pil)
            images[res_id] = {
                "format": "jpg",
                "wrap_pos": "",
                "type": "IMG",
                "suffix": "jpg",
                "fileName": f"{idx}.jpg",
                "imgb64": str(base64.b64encode(encoded), encoding="utf-8"),
            }
            # The page takes the size of the image it shows.
            page_size = [0, 0, w / OP, h / OP]
            pages[idx] = {
                "text_list": [],
                "img_list": [
                    {
                        "CTM": "",  # transform matrix: translate / scale / rotate
                        "ID": res_id,  # image id
                        "ResourceID": res_id,  # image id
                        "pos": [0, 0, w / OP, h / OP],
                    }
                ],
            }

        return [
            {
                "pdf_name": "demo.pdf",
                "doc_no": "0",
                "images": images,
                "page_size": page_size,
                "fonts": {},
                "page_info": pages,
            }
        ]
			
 
				+
			
 
				+    # 获得xml 对象
			
 
    def get_xml_obj(self, label):
        """Return the ``file_tree`` entry whose path matches ``label``.

        Path separators are unified before comparing so that Windows and
        POSIX style paths match each other.  An entry whose path equals
        ``label`` or ends with it at a separator boundary is preferred; if
        there is none, the first entry merely containing ``label`` is used.
        This keeps ``1.png`` from resolving to ``.../11.png``.

        :param label: relative path of the wanted file; leading ``.`` and
            ``/`` characters are ignored
        :return: the stored object, or ``""`` when nothing matches
        """
        assert label

        def _unify(path):
            # Unify separators to avoid Windows / Linux path conflicts.
            return path.replace("\\\\", "-").replace("//", "-").replace("\\", "-").replace("/", "-")

        label = label.lstrip('./')
        label_compare = _unify(label)  # loop invariant, computed once
        boundary_suffix = "-" + label_compare

        fallback_key = None
        for abs_p in self.file_tree:
            abs_p_compare = _unify(abs_p)
            if abs_p_compare == label_compare or abs_p_compare.endswith(boundary_suffix):
                return self.file_tree[abs_p]
            if fallback_key is None and label_compare in abs_p_compare:
                # Remember the first loose match; a later entry may still
                # match exactly.
                fallback_key = abs_p

        if fallback_key is not None:
            return self.file_tree[fallback_key]
        return ""
			
 
				+
			
 
    def jb22png_old(self, img_d: dict):
        """Convert a JBIG2 image to PNG with the external ``jbig2dec`` tool.

        The base64 payload in ``img_d["imgb64"]`` is written to
        ``img_d["fileName"]``, converted, and on success ``img_d`` is
        updated in place (``fileName``, ``suffix``, ``format``, ``imgb64``).
        Nothing happens when ``self.jbig2dec_path`` does not exist.

        :param img_d: image description dict, modified in place
        """
        # Local import: the module-level import block is not touched here.
        import subprocess

        if not os.path.exists(self.jbig2dec_path):
            logger.warning(f"未安装jbig2dec，无法处理jb2文件")
            return

        # Equivalent manual command:
        #   jbig2dec.exe -o image_80.png image_80.jb2
        fileName = img_d["fileName"]
        print('jb2 file_name', fileName)
        new_fileName = img_d['fileName'].replace(".jb2", ".png")
        with open(fileName, "wb") as f:
            f.write(base64.b64decode(img_d["imgb64"]))

        # The file names originate from the document being parsed, so the
        # command is passed as an argument list (no shell): names holding
        # spaces or shell metacharacters are handled safely.
        try:
            res = subprocess.run([self.jbig2dec_path, "-o", new_fileName, fileName]).returncode
        except OSError as err:
            logger.warning(f"jbig2dec could not be started: {err}")
            res = -1
        if res != 0:
            logger.warning(f"jbig2dec failed for {fileName} (exit status {res})")

        if os.path.exists(new_fileName):
            img_d["fileName"] = new_fileName
            img_d["suffix"] = "png"
            img_d["format"] = "png"
            with open(new_fileName, "rb") as f:
                data = f.read()
                img_d["imgb64"] = str(base64.b64encode(data), encoding="utf-8")
			
 
				+
			
 
    def jb22png(self, img_d: dict):
        """Convert a JBIG2 image file to PNG with ``jbig2_parser``.

        Reads ``img_d["fileName"]`` from disk, writes the converted data
        next to it (``.jb2`` replaced by ``.png``) and updates ``img_d`` in
        place: ``fileName``, ``suffix``, ``format`` and ``imgb64``.

        :param img_d: image description dict, modified in place
        :raises OSError: when the source cannot be read or the target
            cannot be written
        """
        file_name = img_d["fileName"]
        new_file_name = img_d['fileName'].replace(".jb2", ".png")

        with open(file_name, "rb") as f:
            data = f.read()
        png_bytes = bytes(jbig2_parser.parse_jbig2(data))

        with open(new_file_name, 'wb') as f:
            f.write(png_bytes)

        # The bytes just written are already in memory; reading the file
        # back (as done before) would yield the same data.
        img_d["fileName"] = new_file_name
        img_d["suffix"] = "png"
        img_d["format"] = "png"
        img_d["imgb64"] = str(base64.b64encode(png_bytes), encoding="utf-8")
			
 
				+
			
 
    def bmp2jpg(self, img_d: dict):
        """Re-encode a BMP resource as JPEG and update ``img_d`` in place.

        The base64 source is looked up through ``get_xml_obj`` using
        ``img_d["fileName"]``; afterwards ``fileName``, ``suffix``,
        ``format`` and ``imgb64`` describe the JPEG version.

        :param img_d: image description dict, modified in place
        """
        fileName = img_d["fileName"]
        new_fileName = img_d['fileName'].replace(".bmp", ".jpg")

        raw = base64.b64decode(self.get_xml_obj(fileName))
        # Both the source image and the scratch buffer are released when
        # the block ends.
        with Image.open(io.BytesIO(raw)) as bitmap, io.BytesIO() as sink:
            bitmap.convert("RGB").save(sink, format="JPEG")
            b64_jpeg = base64.b64encode(sink.getvalue()).decode('utf-8')

        if not b64_jpeg:
            return

        logger.info(f"bmp2jpg处理成功{fileName}>>{new_fileName}")
        img_d.update({
            "fileName": new_fileName,
            "suffix": "jpg",
            "format": "jpg",
            "imgb64": b64_jpeg,
        })
			
 
				+
			
 
    def tif2jpg(self, img_d: dict):
        """Re-encode a TIFF resource as JPEG and update ``img_d`` in place.

        The base64 source is looked up through ``get_xml_obj`` using
        ``img_d["fileName"]``; afterwards ``fileName``, ``suffix``,
        ``format`` and ``imgb64`` describe the JPEG version.

        Images whose mode the JPEG writer does not accept are converted to
        RGB first.  Previously only RGBA / LA / transparent-palette images
        were converted, so e.g. a palette TIFF without transparency failed
        on save.

        :param img_d: image description dict, modified in place
        """
        # Modes Pillow's JPEG writer accepts directly (per its RAWMODE
        # table); everything else needs a conversion.
        jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

        fileName = img_d["fileName"]
        new_fileName = img_d['fileName'].replace(".tif", ".jpg")
        tif_nmp = self.get_xml_obj(fileName)
        image_data = base64.b64decode(tif_nmp)

        with Image.open(io.BytesIO(image_data)) as source, io.BytesIO() as output_buffer:
            image = source
            if image.mode not in jpeg_modes:
                image = source.convert("RGB")
            # Encode as JPEG into the in-memory buffer.
            image.save(output_buffer, format="JPEG", quality=95)
            b64_jpeg = base64.b64encode(output_buffer.getvalue()).decode('utf-8')

        if b64_jpeg:
            logger.info(f"tif2jpg处理成功{fileName}>>{new_fileName}")
            img_d["fileName"] = new_fileName
            img_d["suffix"] = "jpg"
            img_d["format"] = "jpg"
            img_d["imgb64"] = b64_jpeg
			
 
				+
			
 
    def gif2jpg(self, img_d: dict):
        """Re-encode a GIF resource as JPEG and update ``img_d`` in place.

        The base64 source is looked up through ``get_xml_obj`` using
        ``img_d["fileName"]``; afterwards ``fileName``, ``suffix``,
        ``format`` and ``imgb64`` describe the JPEG version.

        :param img_d: image description dict, modified in place
        """
        fileName = img_d["fileName"]
        # Fixed: the extension replaced here used to be ".bmp" (copied from
        # bmp2jpg), which left the ".gif" name unchanged.
        new_fileName = img_d['fileName'].replace(".gif", ".jpg")
        b64_gif = self.get_xml_obj(fileName)
        image_data = base64.b64decode(b64_gif)

        with Image.open(io.BytesIO(image_data)) as source, io.BytesIO() as output_buffer:
            image = source
            if image.mode != "RGB":
                image = source.convert("RGB")
            image.save(output_buffer, format="JPEG", quality=95)
            b64_jpeg = base64.b64encode(output_buffer.getvalue()).decode('utf-8')

        if b64_jpeg:
            logger.info(f"gif2jpg处理成功{fileName}>>{new_fileName}")
            img_d["fileName"] = new_fileName
            img_d["suffix"] = "jpg"
            img_d["format"] = "jpg"
            img_d["imgb64"] = b64_jpeg
			
 
				+
			
 
				+    def parser(self, save_dir):
			
 
				+        """
			
 
				+        解析流程
			
 
				+        doc_0默认只有 一层
			
 
				+        OFD >  Document.xml > [DocumentRes.xml, PublicRes.xml, Signatures.xml Annotations.xml] > []
			
 
				+        """
			
 
				+
			
 
				+        page_size_details = []
			
 
				+        default_page_size = []
			
 
				+        doc_list = []
			
 
				+        ofd_xml_obj = self.get_xml_obj(self.file_tree["root_doc"])  # OFD.xml xml 对象 
			
 
				+
			
 
				+        if ofd_xml_obj:
			
 
				+            ofd_obj_res = OFDFileParser(ofd_xml_obj)()
			
 
				+            doc_root_name = ofd_obj_res.get("doc_root")
			
 
				+            signatures = ofd_obj_res.get("signatures")
			
 
				+        else:
			
 
				+            # 考虑根节点丢失情况
			
 
				+            doc_root_name = ["Doc_0/Document.xml"]
			
 
				+            signatures = ["Doc_0/Signs/Signatures.xml"]
			
 
				+
			
 
				+        doc_root_xml_obj = self.get_xml_obj(doc_root_name[0])
			
 
				+        doc_root_info = DocumentFileParser(doc_root_xml_obj)()
			
 
				+        doc_page_size = self.get_page_size(doc_root_xml_obj)
			
 
				+        # print('doc_page_size', doc_page_size)
			
 
				+
			
 
				+        # 注释文本
			
 
				+        annotations_root_name = doc_root_info.get("Annotations")
			
 
				+        if annotations_root_name:
			
 
				+            annotations_root_name = annotations_root_name[0]
			
 
				+            annot_root_xml_obj = self.get_xml_obj(annotations_root_name)
			
 
				+            # print('annot_root_xml_obj', annot_root_xml_obj)
			
 
				+            annot_root_info = AnnotationFileParser(annot_root_xml_obj)()
			
 
				+            # print('annot_root_info', annot_root_info)
			
 
				+            doc_root_info.update(annot_root_info)
			
 
				+        doc_size = doc_root_info.get("size")
			
 
				+
			
 
				+        if doc_size:
			
 
				+            try:
			
 
				+                default_page_size = [float(pos_i) for pos_i in doc_size.split(" ") if re.match("[\d\.]", pos_i)]
			
 
				+            except:
			
 
				+                traceback.print_exc()
			
 
				+
			
 
				+        # 字体信息
			
 
				+        font_info = {}
			
 
				+        public_res_name: list = doc_root_info.get("public_res")
			
 
				+        if public_res_name:
			
 
				+            public_xml_obj = self.get_xml_obj(public_res_name[0])
			
 
				+            font_info = PublicResFileParser(public_xml_obj)()
			
 
				+
			
 
				+            # 注册字体
			
 
				+            for font_id, font_v in font_info.items():
			
 
				+                file_name = font_v.get("FontFile")
			
 
				+                if file_name:
			
 
				+                    font_b64 = self.get_xml_obj(file_name)
			
 
				+                    if font_b64:
			
 
				+                        font_v["font_b64"] = font_b64
			
 
				+
			
 
				+        # 图片资源
			
 
				+        img_info: dict = dict()
			
 
				+        document_res_name: list = doc_root_info.get("document_res")
			
 
				+        # print('doc_root_info', doc_root_info)
			
 
				+        if document_res_name:
			
 
				+            document_res_xml_obj = self.get_xml_obj(document_res_name[0])
			
 
				+            # print('document_res_xml_obj', document_res_xml_obj)
			
 
				+            img_info = DocumentResFileParser(document_res_xml_obj)()
			
 
				+            # 找到图片b64
			
 
				+            for img_id, img_v in img_info.items():
			
 
				+                img_v["imgb64"] = self.get_xml_obj(img_v.get("fileName"))
			
 
				+                img_v['fileName'] = f"{save_dir}Doc_0\Res\{img_v['fileName']}"
			
 
				+                # todo ib2 转png C:/msys64/mingw64/bin/jbig2dec.exe -o F:\code\easyofd\test\image_80.png F:\code\easyofd\test\image_80.jb2
			
 
				+                if img_v["suffix"] == 'jb2':
			
 
				+                    self.jb22png(img_v)
			
 
				+                elif img_v["suffix"] == 'bmp':
			
 
				+                    self.bmp2jpg(img_v)
			
 
				+                elif img_v["suffix"] == 'tif':
			
 
				+                    self.tif2jpg(img_v)
			
 
				+                elif img_v["suffix"] == 'gif':
			
 
				+                    self.gif2jpg(img_v)
			
 
				+
			
 
				+        img_info2: dict = dict()
			
 
				+        public_res_name: list = doc_root_info.get("public_res")
			
 
				+        # print('doc_root_info', doc_root_info)
			
 
				+        if public_res_name:
			
 
				+            public_res_xml_obj = self.get_xml_obj(public_res_name[0])
			
 
				+            # print('public_res_xml_obj', public_res_xml_obj)
			
 
				+            img_info2 = DocumentResFileParser(public_res_xml_obj)()
			
 
				+            # 找到图片b64
			
 
				+            for img_id, img_v in img_info2.items():
			
 
				+                img_v["imgb64"] = self.get_xml_obj(img_v.get("fileName"))
			
 
				+                # print('img_id, img_v[filename]', img_id, img_v.get('fileName'))
			
 
				+                img_v['fileName'] = f"{save_dir}Doc_0\Res\{img_v['fileName']}"
			
 
				+
			
 
				+                # todo ib2 转png C:/msys64/mingw64/bin/jbig2dec.exe -o F:\code\easyofd\test\image_80.png F:\code\easyofd\test\image_80.jb2
			
 
				+                if img_v["suffix"] == 'jb2':
			
 
				+                    self.jb22png(img_v)
			
 
				+                elif img_v["suffix"] == 'bmp':
			
 
				+                    self.bmp2jpg(img_v)
			
 
				+                elif img_v["suffix"] == 'tif':
			
 
				+                    self.tif2jpg(img_v)
			
 
				+                elif img_v["suffix"] == 'gif':
			
 
				+                    self.gif2jpg(img_v)
			
 
				+            img_info.update(img_info2)
			
 
				+
			
 
				+        page_id_map: list = doc_root_info.get("page_id_map")
			
 
				+        # print('doc_root_info', doc_root_info)
			
 
				+
			
 
				+        signatures_page_id = {}
			
 
				+        # 签章信息
			
 
				+        signatures_xml_obj = None
			
 
				+        # if signatures:
			
 
				+        #     signatures_xml_obj = self.get_xml_obj(signatures[0])
			
 
				+        # if signatures and signatures_xml_obj:
			
 
				+        # # if signatures and (signatures_xml_obj := self.get_xml_obj(signatures[0])):
			
 
				+        # #     logger.debug(f"signatures_xml_obj is {signatures_xml_obj } signatures is {signatures} ")
			
 
				+        #     signatures_info = SignaturesFileParser(signatures_xml_obj)()
			
 
				+        #     if signatures_info:  # 获取签章具体信息
			
 
				+        #         for _, signatures_cell in signatures_info.items():
			
 
				+        #             # print(signatures_info)
			
 
				+        #             BaseLoc = signatures_cell.get("BaseLoc")
			
 
				+        #             signature_xml_obj = self.get_xml_obj(BaseLoc)
			
 
				+        #             # print(BaseLoc)
			
 
				+        #             prefix = BaseLoc.split("/")[0]
			
 
				+        #             signatures_info = SignatureFileParser(signature_xml_obj)(prefix=prefix)
			
 
				+        #             # print(signatures_info)
			
 
				+        #             # logger.debug(f"signatures_info {signatures_info}")
			
 
				+        #             PageRef = signatures_info.get("PageRef")
			
 
				+        #             Boundary = signatures_info.get("Boundary")
			
 
				+        #             SignedValue = signatures_info.get("SignedValue")
			
 
				+        #             sing_page_no = page_id_map.get(PageRef)
			
 
				+        #             # print("self.file_tree",self.file_tree.keys)
			
 
				+        #             # print(page_id_map,PageRef)
			
 
				+        #             # print(SignedValue, self.get_xml_obj(SignedValue))
			
 
				+        #             # with open("b64.txt","w") as f:
			
 
				+        #             #     f.write(self.get_xml_obj(SignedValue))
			
 
				+        #             if signatures_page_id.get(sing_page_no):
			
 
				+        #                 signatures_page_id[sing_page_no].append(
			
 
				+        #                     {
			
 
				+        #                         "sing_page_no": sing_page_no,
			
 
				+        #                         "PageRef": PageRef,
			
 
				+        #                         "Boundary": Boundary,
			
 
				+        #                         "SignedValue": self.get_xml_obj(SignedValue),
			
 
				+        #                     }
			
 
				+        #                 )
			
 
				+        #             else:
			
 
				+        #                 signatures_page_id[sing_page_no] = [
			
 
				+        #                     {
			
 
				+        #                         "sing_page_no": sing_page_no,
			
 
				+        #                         "PageRef": PageRef,
			
 
				+        #                         "Boundary": Boundary,
			
 
				+        #                         "SignedValue": self.get_xml_obj(SignedValue),
			
 
				+        #                     }
			
 
				+        #                 ]
			
 
				+
			
 
				+        # 注释信息
			
 
				+        # print('doc_root_info', doc_root_info)
			
 
				+        # annotation_name: list = doc_root_info.get("Annotations")
			
 
				+        # annotation_xml_obj = None
			
 
				+        # if annotation_name:
			
 
				+        #     annotation_xml_obj = self.get_xml_obj(annotation_name[0])
			
 
				+        # if annotation_name and annotation_xml_obj:
			
 
				+        # # if annotation_name and (annotation_xml_obj:= self.get_xml_obj(annotation_name[0])):
			
 
				+        #     # todo 注释解析
			
 
				+        #
			
 
				+        #     # annotation_info = AnnotationFileParser(annotation_xml_obj)()
			
 
				+        #     annotation_info = AnnotationFileParser(annotation_xml_obj)()
			
 
				+        #     # logger.debug(f"annotation_info is {annotation_info}")
			
 
				+
			
 
				+
			
 
				+        # 正文信息 会有多页 情况
			
 
				+        page_name: list = doc_root_info.get("page")
			
 
				+        page_info_d = {}
			
 
				+        if page_name:
			
 
				+            for index, _page in enumerate(page_name):
			
 
				+                page_xml_obj = self.get_xml_obj(_page)
			
 
				+                # 重新获取页面size
			
 
				+                try:
			
 
				+                    page_size = [float(pos_i) for pos_i in
			
 
				+                                     page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox",
			
 
				+                                                                                              "").split(" ")
			
 
				+                                     if re.match("[\d\.]", pos_i)]
			
 
				+                    if page_size and len(page_size) >= 2:
			
 
				+                        page_size_details.append(page_size)
			
 
				+                    else:
			
 
				+                        if doc_page_size:
			
 
				+                            page_size_details.append(doc_page_size)
			
 
				+                        else:
			
 
				+                            page_size_details.append([])
			
 
				+                except Exception as e:
			
 
				+                    traceback.print_exc()
			
 
				+                    page_size.append([])
			
 
				+                page_info = ContentFileParser(page_xml_obj)()
			
 
				+                pg_no = re.search(r"\d+", _page)
			
 
				+                if pg_no:
			
 
				+                    pg_no = int(pg_no.group())
			
 
				+                else:
			
 
				+                    pg_no = index
			
 
				+                page_info_d[pg_no] = page_info
			
 
				+                # 只跑一页
			
 
				+                # print('odf_parser parser() 只跑一页')
			
 
				+                # break
			
 
				+
			
 
				+        # 注释作为正文提取
			
 
				+        annot_page_info_d = {}
			
 
				+        annot_page_name: list = doc_root_info.get("annot_page")
			
 
				+        if annot_page_name:
			
 
				+            for index, _page in enumerate(annot_page_name):
			
 
				+                annot_page_xml_obj = self.get_xml_obj(_page)
			
 
				+                annot_page_info = ContentFileParser(annot_page_xml_obj)()
			
 
				+                pg_no = re.search(r"\d+", _page)
			
 
				+                if pg_no:
			
 
				+                    pg_no = int(pg_no.group())
			
 
				+                else:
			
 
				+                    pg_no = index
			
 
				+
			
 
				+                # 重新获取页面size
			
 
				+                # try:
			
 
				+                #     page_size = [float(pos_i) for pos_i in
			
 
				+                #                  annot_page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox",
			
 
				+                #                                                                           "").split(" ")
			
 
				+                #                  if re.match("[\d\.]", pos_i)]
			
 
				+                #     if page_size and len(page_size) >= 2:
			
 
				+                #         # page_size_details.append(page_size)
			
 
				+                #         pass
			
 
				+                #     else:
			
 
				+                #         page_size = []
			
 
				+                # except Exception as e:
			
 
				+                #     traceback.print_exc()
			
 
				+                #     page_size.append([])
			
 
				+                page_size = self.get_page_size(annot_page_xml_obj)
			
 
				+                # if not page_size:
			
 
				+                #     page_size = doc_page_size
			
 
				+
			
 
				+                # annot_page_info['annot_page_size'] = page_size
			
 
				+                annot_page_info_d[pg_no] = annot_page_info
			
 
				+                # 只跑一页
			
 
				+                # print('odf_parser parser() 只跑一页')
			
 
				+                # break
			
 
				+        # 注释文本信息合到正文信息中
			
 
				+        for page_id, page_d in page_info_d.items():
			
 
				+            if page_id not in annot_page_info_d.keys():
			
 
				+                continue
			
 
				+            annot_page_d = annot_page_info_d.get(page_id)
			
 
				+            # print("annot_page_d.get('text_list')", annot_page_d.get('text_list'))
			
 
				+            page_d['text_list'] += annot_page_d.get('text_list')
			
 
				+            page_d['annot_text_list'] = annot_page_d.get('text_list')
			
 
				+            # page_d['annot_page_size'] = annot_page_d.get('annot_page_size')
			
 
				+        # print('page_info_d', page_info_d)
			
 
				+        # print('annot_page_info_d', annot_page_info_d)
			
 
				+
			
 
				+        # 模板信息
			
 
				+        tpls_name: list = doc_root_info.get("tpls")
			
 
				+        # if tpls_name:
			
 
				+        #     for index, _tpl in enumerate(tpls_name):
			
 
				+        #         tpl_xml_obj = self.get_xml_obj(_tpl)
			
 
				+        #         tpl_info = ContentFileParser(tpl_xml_obj)()
			
 
				+        #         tpl_no = re.search(r"\d+", _tpl)
			
 
				+        #
			
 
				+        #         if tpl_no:
			
 
				+        #             tpl_no = int(tpl_no.group())
			
 
				+        #         else:
			
 
				+        #             tpl_no = index
			
 
				+        #
			
 
				+        #         if tpl_no in page_info_d:
			
 
				+        #             page_info_d[pg_no]["text_list"].extend(tpl_info["text_list"])
			
 
				+        #             page_info_d[pg_no]["text_list"].sort(
			
 
				+        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
			
 
				+        #             page_info_d[pg_no]["img_list"].extend(tpl_info["img_list"])
			
 
				+        #             page_info_d[pg_no]["img_list"].sort(
			
 
				+        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
			
 
				+        #             page_info_d[pg_no]["line_list"].extend(tpl_info["line_list"])
			
 
				+        #             page_info_d[pg_no]["line_list"].sort(
			
 
				+        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
			
 
				+        #         else:
			
 
				+        #             page_info_d[tpl_no] = tpl_info
			
 
				+        #             page_info_d[tpl_no].sort(
			
 
				+        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
			
 
				+
			
 
				+        # todo 读取注释信息
			
 
				+        page_ID = 0  # 没遇到过doc多个的情况
			
 
				+        # print("page_info",len(page_info))
			
 
				+        doc_list.append({
			
 
				+            "default_page_size": default_page_size,
			
 
				+            "page_size": page_size_details,
			
 
				+            "pdf_name": self.file_tree["pdf_name"],
			
 
				+            "doc_no": page_ID,
			
 
				+            "images": img_info,
			
 
				+            "signatures_page_id": signatures_page_id,
			
 
				+            "page_id_map": page_id_map,
			
 
				+            "fonts": font_info,
			
 
				+            "page_info": page_info_d,
			
 
				+            "page_tpl_info": page_info_d,
			
 
				+            "page_content_info": page_info_d,
			
 
				+            # "annot_page_info": annot_page_info_d,
			
 
				+        })
			
 
				+        return doc_list
			
 
				+
			
 
    def get_page_size(self, page_xml_obj):
        """Return the physical box of a parsed OFD page as a list of floats.

        The box string is looked up at ``ofd:Page -> ofd:Area ->
        ofd:PhysicalBox``. When that yields fewer than two numbers, the
        document-level ``ofd:Document -> ofd:CommonData -> ofd:PageArea ->
        ofd:PhysicalBox`` is tried instead.

        Args:
            page_xml_obj: Nested mapping (anything supporting chained
                ``.get``) produced by parsing the page XML.

        Returns:
            The numbers found in the box string, in order, or ``[]`` when
            neither location provides at least two numbers or when the lookup
            or conversion fails (the traceback is printed in that case).
        """

        def read_box(root_tag, *container_tags):
            # Walk root -> containers -> "ofd:PhysicalBox" and convert every
            # space-separated token that starts with a digit or a dot.
            node = page_xml_obj.get(root_tag, {})
            for tag in container_tags:
                node = node.get(tag, {})
            raw_box = node.get("ofd:PhysicalBox", "")
            # Raw string: "\d" / "\." in a plain literal are invalid escapes.
            return [float(token) for token in raw_box.split(" ")
                    if re.match(r"[\d\.]", token)]

        try:
            page_size = read_box('ofd:Page', "ofd:Area")
            if len(page_size) < 2:
                # Page-level box missing or unusable: fall back to the
                # document-wide page area.
                page_size = read_box('ofd:Document', 'ofd:CommonData', "ofd:PageArea")
                if len(page_size) < 2:
                    page_size = []
        except Exception:
            # Best effort: malformed input must not abort parsing of the file.
            traceback.print_exc()
            page_size = []
        return page_size
			
 
				+
			
 
    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        """Read the OFD payload and return the parsed result.

        Recognised keyword arguments are ``save_xml`` (defaults to False),
        ``xml_name`` and ``save_dir``; all three are forwarded to the file
        reader, and ``save_dir`` is also handed to ``self.parser``.
        """
        save_dir = kwargs.get("save_dir")
        reader = FileRead(self.ofdb64)
        self.file_tree = reader(
            save_xml=kwargs.get("save_xml", False),
            xml_name=kwargs.get("xml_name"),
            save_dir=save_dir,
        )
        return self.parser(save_dir)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    p = "C:/Users/Administrator/Downloads/1750060386706.ofd"
			
 
				+    with open(p, "rb") as f:
			
 
				+        ofdb64 = str(base64.b64encode(f.read()), "utf-8")
			
 
				+    obj_list = OFDParser(ofdb64)()
			
 
				+    for obj in obj_list:
			
 
				+        print('obj', obj)
			
--- a/format_convert/easyofd/easyofd/parser_ofd/parameter_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/parameter_parser.py
@@ -0,0 +1,31 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME: easyofd
			
 
				+# CREATE_TIME: 
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: renoyuan
			
 
				+# note:参数解析器
			
 
				+from loguru import logger
			
 
				+from typing import List, Dict, Any, Union, Tuple, Optional
			
 
				+
			
 
				+
			
 
				+class ParameterParser(object):
			
 
				+    parameter = {
			
 
				+        "ofd:FillColor": (dict, dict),
			
 
				+        "ofd:StrokeColor": (dict, dict),
			
 
				+        "ofd:Test": ((str, int), str),
			
 
				+        "ofd:Font": (str, str),
			
 
				+        "@Value": (str, str)
			
 
				+    }
			
 
				+
			
 
				+    def __call__(self, key, container):
			
 
				+        if key in ParameterParser.parameter:
			
 
				+            v = container.get(key, None)
			
 
				+            t = ParameterParser.parameter[key]
			
 
				+            if isinstance(v, t[0]):
			
 
				+                return v
			
 
				+            else:
			
 
				+                return t[1]()
			
 
				+        else:
			
 
				+            logger.warning(f"{key} not in ParameterParser")
			
 
				+            return None
			
--- a/format_convert/easyofd/easyofd/parser_ofd/path_parser.py
+++ b/format_convert/easyofd/easyofd/parser_ofd/path_parser.py
@@ -0,0 +1,61 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  path_parser.py
			
 
				+# CREATE_TIME: 2025/4/9 16:31
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE:
			
 
				+from enum import Enum
			
 
				+import os
			
 
				+
			
 
				+class PathType(Enum):
			
 
				+    absolutely = 1
			
 
				+    relative = 2
			
 
				+
			
 
				+class PathParser:
			
 
				+    """
			
 
				+    Parser Path
			
 
				+    路径解析器
			
 
				+    解析文件路径返回绝对路径
			
 
				+    "/ROOT/a.xml"
			
 
				+    "./ROOT/a.xml"
			
 
				+    "../ROOT/a.xml"
			
 
				+    "ROOT/a.xml"
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, root_path:str):
			
 
				+        if os.name == 'nt':
			
 
				+            self.os = "nt"
			
 
				+        else:
			
 
				+            self.os = "posix"
			
 
				+
			
 
				+        self.root_path = self.format_path(root_path)
			
 
				+
			
 
				+    def format_path(self,path:str):
			
 
				+        normalized = os.path.normpath(path)
			
 
				+        if self.os == "nt":
			
 
				+            return normalized.replace("/","\\")
			
 
				+        else:
			
 
				+            return normalized.replace("\\","/")
			
 
				+
			
 
				+    def get_path_type(self, path:str):
			
 
				+        if os.path.isabs(path):
			
 
				+            return PathType.absolutely
			
 
				+        else:
			
 
				+            return PathType.relative
			
 
				+
			
 
				+    def __call__(self,cur_path:str,loc_path:str):
			
 
				+        """
			
 
				+        loc_path is posix style
			
 
				+        """
			
 
				+        path_type = self.get_path_type(loc_path)
			
 
				+        if path_type == PathType.absolutely:
			
 
				+            return self.format_path(loc_path)
			
 
				+        if path_type == PathType.relative:
			
 
				+            if loc_path.startswith("./"):
			
 
				+                path = os.path.join(cur_path, self.format_path(loc_path[2:]))
			
 
				+            elif loc_path.startswith("../"):
			
 
				+                path = os.path.join(os.path.dirname(cur_path), self.format_path(loc_path[3:]))
			
 
				+            else:
			
 
				+                path = os.path.join(os.path.dirname(cur_path), self.format_path(loc_path))
			
 
				+            return path
			
--- a/format_convert/easyofd/easyofd/template_ofd/__init__.py
+++ b/format_convert/easyofd/easyofd/template_ofd/__init__.py
@@ -0,0 +1,7 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# PROJECT_NAME:  __init__.py.py
			
 
				+# CREATE_TIME: 2025/3/28 15:43
			
 
				+# E_MAIL: renoyuan@foxmail.com
			
 
				+# AUTHOR: reno
			
 
				+# NOTE:
			
--- a/format_convert/font_map/extend_to_normal_dict.txt
+++ b/format_convert/font_map/extend_to_normal_dict.txt
@@ -0,0 +1,53 @@
 
				+{
			
 
				+ "⺁":"厂",
			
 
				+ "⺇":"几",
			
 
				+ "⺌":"小",
			
 
				+ "⺎":"兀",
			
 
				+ "⺏":"尣",
			
 
				+ "⺐":"尢",
			
 
				+ "⺑":"𡯂",
			
 
				+ "⺒":"巳",
			
 
				+ "⺓":"幺",
			
 
				+ "⺛":"旡",
			
 
				+ "⺝":"月",
			
 
				+ "⺟":"母",
			
 
				+ "⺠":"民",
			
 
				+ "⺱":"冈",
			
 
				+ "⺸":"芈",
			
 
				+ "⻁":"虎",
			
 
				+ "⻄":"西",
			
 
				+ "⻅":"见",
			
 
				+ "⻆":"角",
			
 
				+ "⻇":"𧢲",
			
 
				+ "⻉":"贝",
			
 
				+ "⻋":"车",
			
 
				+ "⻒":"镸",
			
 
				+ "⻓":"长",
			
 
				+ "⻔":"门",
			
 
				+ "⻗":"雨",
			
 
				+ "⻘":"青",
			
 
				+ "⻙":"韦",
			
 
				+ "⻚":"页",
			
 
				+ "⻛":"风",
			
 
				+ "⻜":"飞",
			
 
				+ "⻝":"食",
			
 
				+ "⻡":"𩠐",
			
 
				+ "⻢":"马",
			
 
				+ "⻣":"骨",
			
 
				+ "⻤":"鬼",
			
 
				+ "⻥":"鱼",
			
 
				+ "⻦":"鸟",
			
 
				+ "⻧":"卤",
			
 
				+ "⻨":"麦",
			
 
				+ "⻩":"黄",
			
 
				+ "⻬":"齐",
			
 
				+ "⻮":"齿",
			
 
				+ "⻯":"竜",
			
 
				+ "⻰":"龙",
			
 
				+ "⻳":"龟",
			
 
				+ "⾅":"臼",
			
 
				+ "⼝":"口",
			
 
				+ "⼾":"户",
			
 
				+ "⼉":"儿",
			
 
				+ "⼱":"巾"
			
 
				+}
			
--- a/format_convert/font_map/kangxi_to_normal
+++ b/format_convert/font_map/kangxi_to_normal
@@ -0,0 +1,214 @@
 
				+⼀ 2F00　一 4E00
			
 
				+⼁ 2F01　丨 4E28
			
 
				+⼂ 2F02　丶 4E36
			
 
				+⼃ 2F03　丿 4E3F
			
 
				+⼄ 2F04　乙 4E59
			
 
				+⼅ 2F05　亅 4E85
			
 
				+⼆ 2F06　二 4E8C
			
 
				+⼇ 2F07　亠 4EA0
			
 
				+⼈ 2F08　人 4EBA
			
 
				+⼉ 2F09　儿 513F
			
 
				+⼊ 2F0A　入 5165
			
 
				+⼋ 2F0B　八 516B
			
 
				+⼌ 2F0C　冂 5182
			
 
				+⼍ 2F0D　冖 5196
			
 
				+⼎ 2F0E　冫 51AB　
			
 
				+⼏ 2F0F　几 51E0
			
 
				+⼐ 2F10　凵 51F5
			
 
				+⼑ 2F11　刀 5200
			
 
				+⼒ 2F12　力 529B
			
 
				+⼓ 2F13　勹 52F9
			
 
				+⼔ 2F14　匕 5315　
			
 
				+⼕ 2F15　匚 531A　
			
 
				+⼖ 2F16　匸 5338　
			
 
				+⼗ 2F17　十 5341
			
 
				+⼘ 2F18　卜 535C
			
 
				+⼙ 2F19　卩 5369
			
 
				+⼚ 2F1A　厂 5382
			
 
				+⼛ 2F1B　厶 53B6
			
 
				+⼜ 2F1C　又 53C8
			
 
				+⼝ 2F1D　口 53E3
			
 
				+⼞ 2F1E　囗 56D7
			
 
				+⼟ 2F1F　土 571F
			
 
				+⼠ 2F20　士 58EB
			
 
				+⼡ 2F21　夂 5902
			
 
				+⼢ 2F22　夊 590A
			
 
				+⼣ 2F23　夕 5915
			
 
				+⼤ 2F24　大 5927
			
 
				+⼥ 2F25　女 5973
			
 
				+⼦ 2F26　子 5B50
			
 
				+⼧ 2F27　宀 5B80
			
 
				+⼨ 2F28　寸 5BF8
			
 
				+⼩ 2F29　小 5C0F
			
 
				+⼪ 2F2A　尢 5C22
			
 
				+⼫ 2F2B　尸 5C38
			
 
				+⼬ 2F2C　屮 5C6E
			
 
				+⼭ 2F2D　山 5C71
			
 
				+⼮ 2F2E　巛 5DDB
			
 
				+⼯ 2F2F　工 5DE5
			
 
				+⼰ 2F30　己 5DF1
			
 
				+⼱ 2F31　巾 5DFE
			
 
				+⼲ 2F32　干 5E72
			
 
				+⼳ 2F33　幺 5E7A
			
 
				+⼴ 2F34　广 5E7F
			
 
				+⼵ 2F35　廴 5EF4
			
 
				+⼶ 2F36　廾 5EFE
			
 
				+⼷ 2F37　弋 5F0B
			
 
				+⼸ 2F38　弓 5F13
			
 
				+⼹ 2F39　彐 5F50
			
 
				+⼺ 2F3A　彡 5F61
			
 
				+⼻ 2F3B　彳 5F73
			
 
				+⼼ 2F3C　心 5FC3
			
 
				+⼽ 2F3D　戈 6208
			
 
				+⼾ 2F3E　戶 6236
			
 
				+⼿ 2F3F　手 624B
			
 
				+⽀ 2F40　支 652F
			
 
				+⽁ 2F41　攴 6534
			
 
				+⽂ 2F42　文 6587
			
 
				+⽃ 2F43　斗 6597
			
 
				+⽄ 2F44　斤 65A4
			
 
				+⽅ 2F45　方 65B9
			
 
				+⽆ 2F46　无 65E0
			
 
				+⽇ 2F47　日 65E5
			
 
				+⽈ 2F48　曰 66F0
			
 
				+⽉ 2F49　月 6708
			
 
				+⽊ 2F4A　木 6728
			
 
				+⽋ 2F4B　欠 6B20
			
 
				+⽌ 2F4C　止 6B62
			
 
				+⽍ 2F4D　歹 6B79
			
 
				+⽎ 2F4E　殳 6BB3
			
 
				+⽏ 2F4F　毋 6BCB
			
 
				+⽐ 2F50　比 6BD4
			
 
				+⽑ 2F51　毛 6BDB
			
 
				+⽒ 2F52　氏 6C0F
			
 
				+⽓ 2F53　气 6C14
			
 
				+⽔ 2F54　水 6C34
			
 
				+⽕ 2F55　火 706B
			
 
				+⽖ 2F56　爪 722A
			
 
				+⽗ 2F57　父 7236
			
 
				+⽘ 2F58　爻 723B
			
 
				+⽙ 2F59　爿 723F
			
 
				+⽚ 2F5A　片 7247
			
 
				+⽛ 2F5B　牙 7259
			
 
				+⽜ 2F5C　牛 725B
			
 
				+⽝ 2F5D　犬 72AC
			
 
				+⽞ 2F5E　玄 7384
			
 
				+⽟ 2F5F　玉 7389
			
 
				+⽠ 2F60　瓜 74DC
			
 
				+⽡ 2F61　瓦 74E6
			
 
				+⽢ 2F62　甘 7518
			
 
				+⽣ 2F63　生 751F
			
 
				+⽤ 2F64　用 7528
			
 
				+⽥ 2F65　田 7530
			
 
				+⽦ 2F66　疋 758B
			
 
				+⽧ 2F67　疒 7592
			
 
				+⽨ 2F68　癶 7676
			
 
				+⽩ 2F69　白 767D
			
 
				+⽪ 2F6A　皮 76AE
			
 
				+⽫ 2F6B　皿 76BF
			
 
				+⽬ 2F6C　目 76EE
			
 
				+⽭ 2F6D　矛 77DB
			
 
				+⽮ 2F6E　矢 77E2
			
 
				+⽯ 2F6F　石 77F3
			
 
				+⽰ 2F70　示 793A
			
 
				+⽱ 2F71　禸 79B8
			
 
				+⽲ 2F72　禾 79BE
			
 
				+⽳ 2F73　穴 7A74
			
 
				+⽴ 2F74　立 7ACB
			
 
				+⽵ 2F75　竹 7AF9
			
 
				+⽶ 2F76　米 7C73
			
 
				+⽷ 2F77　糸 7CF8
			
 
				+⽸ 2F78　缶 7F36
			
 
				+⽹ 2F79　网 7F51
			
 
				+⽺ 2F7A　羊 7F8A
			
 
				+⽻ 2F7B　羽 7FBD
			
 
				+⽼ 2F7C　老 8001
			
 
				+⽽ 2F7D　而 800C
			
 
				+⽾ 2F7E　耒 8012
			
 
				+⽿ 2F7F　耳 8033
			
 
				+⾀ 2F80　聿 807F
			
 
				+⾁ 2F81　肉 8089
			
 
				+⾂ 2F82　臣 81E3
			
 
				+⾃ 2F83　自 81EA
			
 
				+⾄ 2F84　至 81F3
			
 
				+⾅ 2F85　臼 81FC
			
 
				+⾆ 2F86　舌 820C
			
 
				+⾇ 2F87　舛 821B
			
 
				+⾈ 2F88　舟 821F
			
 
				+⾉ 2F89　艮 826E
			
 
				+⾊ 2F8A　色 8272
			
 
				+⾋ 2F8B　艸 8278
			
 
				+⾌ 2F8C　虍 864D
			
 
				+⾍ 2F8D　虫 866B
			
 
				+⾎ 2F8E　血 8840
			
 
				+⾏ 2F8F　行 884C
			
 
				+⾐ 2F90　衣 8863
			
 
				+⾑ 2F91　襾 897E
			
 
				+⾒ 2F92　見 898B
			
 
				+⾓ 2F93　角 89D2
			
 
				+⾔ 2F94　言 8A00
			
 
				+⾕ 2F95　谷 8C37
			
 
				+⾖ 2F96　豆 8C46
			
 
				+⾗ 2F97　豕 8C55
			
 
				+⾘ 2F98　豸 8C78
			
 
				+⾙ 2F99　貝 8C9D
			
 
				+⾚ 2F9A　赤 8D64
			
 
				+⾛ 2F9B　走 8D70
			
 
				+⾜ 2F9C　足 8DB3
			
 
				+⾝ 2F9D　身 8EAB
			
 
				+⾞ 2F9E　車 8ECA
			
 
				+⾟ 2F9F　辛 8F9B
			
 
				+⾠ 2FA0　辰 8FB0
			
 
				+⾡ 2FA1　辵 8FB5
			
 
				+⾢ 2FA2　邑 9091
			
 
				+⾣ 2FA3　酉 9149
			
 
				+⾤ 2FA4　采 91C7
			
 
				+⾥ 2FA5　里 91CC
			
 
				+⾦ 2FA6　金 91D1
			
 
				+⾧ 2FA7　長 9577
			
 
				+⾨ 2FA8　門 9580
			
 
				+⾩ 2FA9　阜 961C
			
 
				+⾪ 2FAA　隶 96B6
			
 
				+⾫ 2FAB　隹 96B9
			
 
				+⾬ 2FAC　雨 96E8
			
 
				+⾭ 2FAD　青 9752
			
 
				+⾮ 2FAE　非 975E
			
 
				+⾯ 2FAF　面 9762
			
 
				+⾰ 2FB0　革 9769
			
 
				+⾱ 2FB1　韋 97CB
			
 
				+⾲ 2FB2　韭 97ED
			
 
				+⾳ 2FB3　音 97F3
			
 
				+⾴ 2FB4　頁 9801
			
 
				+⾵ 2FB5　風 98A8
			
 
				+⾶ 2FB6　飛 98DB
			
 
				+⾷ 2FB7　食 98DF
			
 
				+⾸ 2FB8　首 9996
			
 
				+⾹ 2FB9　香 9999
			
 
				+⾺ 2FBA　馬 99AC
			
 
				+⾻ 2FBB　骨 9AA8
			
 
				+⾼ 2FBC　高 9AD8
			
 
				+⾽ 2FBD　髟 9ADF
			
 
				+⾾ 2FBE　鬥 9B25
			
 
				+⾿ 2FBF　鬯 9B2F
			
 
				+⿀ 2FC0　鬲 9B32
			
 
				+⿁ 2FC1　鬼 9B3C
			
 
				+⿂ 2FC2　魚 9B5A
			
 
				+⿃ 2FC3　鳥 9CE5
			
 
				+⿄ 2FC4　鹵 9E75
			
 
				+⿅ 2FC5　鹿 9E7F
			
 
				+⿆ 2FC6　麥 9EA5
			
 
				+⿇ 2FC7　麻 9EBB
			
 
				+⿈ 2FC8　黃 9EC3
			
 
				+⿉ 2FC9　黍 9ECD
			
 
				+⿊ 2FCA　黑 9ED1
			
 
				+⿋ 2FCB　黹 9EF9
			
 
				+⿌ 2FCC　黽 9EFD
			
 
				+⿍ 2FCD　鼎 9F0E
			
 
				+⿎ 2FCE　鼓 9F13
			
 
				+⿏ 2FCF　鼠 9F20
			
 
				+⿐ 2FD0　鼻 9F3B
			
 
				+⿑ 2FD1　齊 9F4A
			
 
				+⿒ 2FD2　齒 9F52
			
 
				+⿓ 2FD3　龍 9F8D
			
 
				+⿔ 2FD4　龜 9F9C
			
 
				+⿕ 2FD5　龠 9FA0
			
--- a/format_convert/font_map/kangxi_to_normal_dict.txt
+++ b/format_convert/font_map/kangxi_to_normal_dict.txt
@@ -0,0 +1,154 @@
 
				+{
			
 
				+    "⼀": "一",
			
 
				+    "⼄": "乙",
			
 
				+    "⼆": "二",
			
 
				+    "⼈": "人",
			
 
				+    "⼉": "儿",
			
 
				+    "⼊": "入",
			
 
				+    "⼋": "八",
			
 
				+    "⼏": "几",
			
 
				+    "⼑": "刀",
			
 
				+    "⼒": "力",
			
 
				+    "⼔": "匕",
			
 
				+    "⼗": "十",
			
 
				+    "⼘": "卜",
			
 
				+    "⼚": "厂",
			
 
				+    "⼜": "又",
			
 
				+    "⼝": "口",
			
 
				+    "⼞": "口",
			
 
				+    "⼟": "土",
			
 
				+    "⼠": "士",
			
 
				+    "⼤": "大",
			
 
				+    "⼥": "女",
			
 
				+    "⼦": "子",
			
 
				+    "⼨": "寸",
			
 
				+    "⼩": "小",
			
 
				+    "⼫": "尸",
			
 
				+    "⼭": "山",
			
 
				+    "⼯": "工",
			
 
				+    "⼰": "己",
			
 
				+    "⼲": "干",
			
 
				+    "⼴": "广",
			
 
				+    "⼸": "弓",
			
 
				+    "⼼": "心",
			
 
				+    "⼽": "戈",
			
 
				+    "⼿": "手",
			
 
				+    "⽀": "支",
			
 
				+    "⽂": "文",
			
 
				+    "⽃": "斗",
			
 
				+    "⽄": "斤",
			
 
				+    "⽅": "方",
			
 
				+    "⽆": "无",
			
 
				+    "⽇": "日",
			
 
				+    "⽈": "曰",
			
 
				+    "⽉": "月",
			
 
				+    "⽊": "木",
			
 
				+    "⽋": "欠",
			
 
				+    "⽌": "止",
			
 
				+    "⽍": "歹",
			
 
				+    "⽏": "毋",
			
 
				+    "⽐": "比",
			
 
				+    "⽑": "毛",
			
 
				+    "⽒": "氏",
			
 
				+    "⽓": "气",
			
 
				+    "⽔": "水",
			
 
				+    "⽕": "火",
			
 
				+    "⽖": "爪",
			
 
				+    "⽗": "父",
			
 
				+    "⽚": "片",
			
 
				+    "⽛": "牙",
			
 
				+    "⽜": "牛",
			
 
				+    "⽝": "犬",
			
 
				+    "⽞": "玄",
			
 
				+    "⽟": "玉",
			
 
				+    "⽠": "瓜",
			
 
				+    "⽡": "瓦",
			
 
				+    "⽢": "甘",
			
 
				+    "⽣": "生",
			
 
				+    "⽤": "用",
			
 
				+    "⽥": "田",
			
 
				+    "⽩": "白",
			
 
				+    "⽪": "皮",
			
 
				+    "⽫": "皿",
			
 
				+    "⽬": "目",
			
 
				+    "⽭": "矛",
			
 
				+    "⽮": "矢",
			
 
				+    "⽯": "石",
			
 
				+    "⽰": "示",
			
 
				+    "⽲": "禾",
			
 
				+    "⽳": "穴",
			
 
				+    "⽴": "立",
			
 
				+    "⽵": "竹",
			
 
				+    "⽶": "米",
			
 
				+    "⽸": "缶",
			
 
				+    "⽹": "网",
			
 
				+    "⽺": "羊",
			
 
				+    "⽻": "羽",
			
 
				+    "⽼": "老",
			
 
				+    "⽽": "而",
			
 
				+    "⽿": "耳",
			
 
				+    "⾁": "肉",
			
 
				+    "⾂": "臣",
			
 
				+    "⾃": "自",
			
 
				+    "⾄": "至",
			
 
				+    "⾆": "舌",
			
 
				+    "⾈": "舟",
			
 
				+    "⾉": "艮",
			
 
				+    "⾊": "色",
			
 
				+    "⾍": "虫",
			
 
				+    "⾎": "血",
			
 
				+    "⾏": "行",
			
 
				+    "⾐": "衣",
			
 
				+    "⾒": "见",
			
 
				+    "⾓": "角",
			
 
				+    "⾔": "言",
			
 
				+    "⾕": "谷",
			
 
				+    "⾖": "豆",
			
 
				+    "⾚": "赤",
			
 
				+    "⾛": "走",
			
 
				+    "⾜": "足",
			
 
				+    "⾝": "身",
			
 
				+    "⾞": "车",
			
 
				+    "⾟": "辛",
			
 
				+    "⾠": "辰",
			
 
				+    "⾢": "邑",
			
 
				+    "⾣": "酉",
			
 
				+    "⾤": "采",
			
 
				+    "⾥": "里",
			
 
				+    "⾦": "金",
			
 
				+    "⾧": "长",
			
 
				+    "⾨": "门",
			
 
				+    "⾩": "阜",
			
 
				+    "⾪": "隶",
			
 
				+    "⾬": "雨",
			
 
				+    "⾭": "青",
			
 
				+    "⾮": "非",
			
 
				+    "⾯": "面",
			
 
				+    "⾰": "革",
			
 
				+    "⾲": "韭",
			
 
				+    "⾳": "音",
			
 
				+    "⾴": "页",
			
 
				+    "⾵": "风",
			
 
				+    "⾶": "飞",
			
 
				+    "⾷": "食",
			
 
				+    "⾸": "首",
			
 
				+    "⾹": "香",
			
 
				+    "⾺": "马",
			
 
				+    "⾻": "骨",
			
 
				+    "⾼": "高",
			
 
				+    "⿁": "鬼",
			
 
				+    "⿂": "鱼",
			
 
				+    "⿃": "鸟",
			
 
				+    "⿄": "卤",
			
 
				+    "⿅": "鹿",
			
 
				+    "⿇": "麻",
			
 
				+    "⿉": "黍",
			
 
				+    "⿊": "黑",
			
 
				+    "⿍": "鼎",
			
 
				+    "⿎": "鼓",
			
 
				+    "⿏": "鼠",
			
 
				+    "⿐": "鼻",
			
 
				+    "⿒": "齿",
			
 
				+    "⿓": "龙",
			
 
				+    "⼣": "夕"
			
 
				+}
			
--- a/format_convert/ofd/ofd_parser.py
+++ b/format_convert/ofd/ofd_parser.py
@@ -0,0 +1,327 @@
 
				+import os
			
 
				+import zipfile
			
 
				+import xml.etree.ElementTree as ET
			
 
				+from typing import Dict, List, Any, Optional
			
 
				+from pathlib import Path
			
 
				+
			
 
				+
			
 
				+class OFDParser:
			
 
				+    """OFD文件解析器"""
			
 
				+
			
 
				+    def __init__(self, ofd_path: str):
			
 
				+        """初始化解析器并验证OFD文件"""
			
 
				+        self.ofd_path = ofd_path
			
 
				+        self.temp_dir = Path("./ofd_temp")
			
 
				+        self.ofd_info = {}
			
 
				+        self.documents = []
			
 
				+
			
 
				+        if not os.path.exists(ofd_path):
			
 
				+            raise FileNotFoundError(f"OFD文件不存在: {ofd_path}")
			
 
				+
			
 
				+        if not zipfile.is_zipfile(ofd_path):
			
 
				+            raise ValueError(f"文件不是有效的OFD文件(Zip格式): {ofd_path}")
			
 
				+
			
 
				+    def parse(self) -> Dict[str, Any]:
			
 
				+        """解析OFD文件并返回内容结构"""
			
 
				+        try:
			
 
				+            self._extract_ofd()
			
 
				+            self._parse_ofd_xml()
			
 
				+            self._parse_documents()
			
 
				+            return {
			
 
				+                "file_info": self.ofd_info,
			
 
				+                "documents": self.documents
			
 
				+            }
			
 
				+        finally:
			
 
				+            self._cleanup()
			
 
				+
			
 
				+    def _extract_ofd(self) -> None:
			
 
				+        """解压OFD文件到临时目录"""
			
 
				+        self.temp_dir.mkdir(exist_ok=True)
			
 
				+        with zipfile.ZipFile(self.ofd_path, 'r') as zip_ref:
			
 
				+            zip_ref.extractall(self.temp_dir)
			
 
				+
			
 
				+    def _parse_ofd_xml(self) -> None:
			
 
				+        """解析OFD.xml文件获取基本信息"""
			
 
				+        ofd_xml_path = self.temp_dir / "OFD.xml"
			
 
				+        if not ofd_xml_path.exists():
			
 
				+            raise ValueError("OFD.xml文件缺失")
			
 
				+
			
 
				+        root = ET.parse(ofd_xml_path).getroot()
			
 
				+        namespace = {'ofd': 'http://www.ofdspec.org/2016'}
			
 
				+
			
 
				+        # 解析文档基本信息
			
 
				+        doc_body = root.find('ofd:DocBody', namespace)
			
 
				+        if doc_body is not None:
			
 
				+            # 解析文档根信息
			
 
				+            doc_file = doc_body.find('ofd:DocFile', namespace)
			
 
				+            if doc_file is not None:
			
 
				+                self.ofd_info['doc_file'] = doc_file.text
			
 
				+
			
 
				+            # 解析签名信息
			
 
				+            signatures = doc_body.find('ofd:Signatures', namespace)
			
 
				+            if signatures is not None:
			
 
				+                self.ofd_info['signatures'] = {
			
 
				+                    'file': signatures.get('FileRef'),
			
 
				+                    'count': int(signatures.get('Count', 0))
			
 
				+                }
			
 
				+
			
 
				+    def _parse_documents(self) -> None:
			
 
				+        """解析文档内容"""
			
 
				+        # 获取所有Document.xml文件
			
 
				+        doc_xml_files = list(self.temp_dir.rglob("Document.xml"))
			
 
				+        for doc_xml in doc_xml_files:
			
 
				+            doc_info = self._parse_document(doc_xml)
			
 
				+            self.documents.append(doc_info)
			
 
				+
			
 
				+    def _parse_document(self, doc_xml_path: Path) -> Dict[str, Any]:
			
 
				+        """解析单个文档"""
			
 
				+        namespace = {'ofd': 'http://www.ofdspec.org/2016'}
			
 
				+        root = ET.parse(doc_xml_path).getroot()
			
 
				+
			
 
				+        document = {
			
 
				+            'path': str(doc_xml_path),
			
 
				+            'pages': [],
			
 
				+            'fonts': self._parse_fonts(root, namespace),
			
 
				+            'metadata': self._parse_metadata(root, namespace)
			
 
				+        }
			
 
				+
			
 
				+        # 解析页面信息
			
 
				+        pages_node = root.find('.//ofd:Pages', namespace)
			
 
				+        if pages_node is not None:
			
 
				+            page_references = pages_node.findall('ofd:Page', namespace)
			
 
				+            for page_ref in page_references:
			
 
				+                page_id = page_ref.get('ID')
			
 
				+                page_file = page_ref.find('ofd:PageFile', namespace)
			
 
				+                if page_file is not None:
			
 
				+                    page_path = self.temp_dir / page_file.text
			
 
				+                    if page_path.exists():
			
 
				+                        page_info = self._parse_page(page_path)
			
 
				+                        document['pages'].append({
			
 
				+                            'id': page_id,
			
 
				+                            'content': page_info
			
 
				+                        })
			
 
				+
			
 
				+        return document
			
 
				+
			
 
				+    def _parse_fonts(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, str]]:
			
 
				+        """解析文档字体信息"""
			
 
				+        fonts = []
			
 
				+        font_list = root.find('.//ofd:Fonts', ns)
			
 
				+        if font_list is not None:
			
 
				+            for font_node in font_list.findall('ofd:Font', ns):
			
 
				+                font = {
			
 
				+                    'id': font_node.get('ID'),
			
 
				+                    'name': font_node.get('FontName'),
			
 
				+                    'family': font_node.get('FamilyName'),
			
 
				+                    'format': font_node.get('FontFormat'),
			
 
				+                    'bold': font_node.get('Bold') == 'true',
			
 
				+                    'italic': font_node.get('Italic') == 'true',
			
 
				+                    'serif': font_node.get('Serif') == 'true',
			
 
				+                    'fixed_width': font_node.get('FixedWidth') == 'true'
			
 
				+                }
			
 
				+                fonts.append(font)
			
 
				+        return fonts
			
 
				+
			
 
				+    def _parse_metadata(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, str]:
			
 
				+        """解析文档元数据"""
			
 
				+        metadata = {}
			
 
				+        doc_info = root.find('.//ofd:DocInfo', ns)
			
 
				+        if doc_info is not None:
			
 
				+            for attr in ['Title', 'Author', 'Subject', 'Keywords', 'Creator',
			
 
				+                         'CreatorVersion', 'CreationDate', 'ModDate']:
			
 
				+                element = doc_info.find(f'ofd:{attr}', ns)
			
 
				+                if element is not None and element.text:
			
 
				+                    metadata[attr] = element.text
			
 
				+        return metadata
			
 
				+
			
 
				+    def _parse_page(self, page_path: Path) -> Dict[str, Any]:
			
 
				+        """解析页面内容"""
			
 
				+        namespace = {
			
 
				+            'ofd': 'http://www.ofdspec.org/2016',
			
 
				+            'ofdtext': 'http://www.ofdspec.org/2016',
			
 
				+            'ofdgraph': 'http://www.ofdspec.org/2016',
			
 
				+            'ofdimg': 'http://www.ofdspec.org/2016'
			
 
				+        }
			
 
				+        root = ET.parse(page_path).getroot()
			
 
				+
			
 
				+        page = {
			
 
				+            'size': self._parse_page_size(root, namespace),
			
 
				+            'text_content': self._extract_text_content(root, namespace),
			
 
				+            'images': self._extract_images(root, namespace),
			
 
				+            'graphics': self._extract_graphics(root, namespace),
			
 
				+            'layers': self._parse_layers(root, namespace)
			
 
				+        }
			
 
				+
			
 
				+        return page
			
 
				+
			
 
				+    def _parse_page_size(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, float]:
			
 
				+        """解析页面尺寸"""
			
 
				+        box = root.find('.//ofd:Area/ofd:PhysicalBox', ns)
			
 
				+        if box is not None:
			
 
				+            return {
			
 
				+                'width': float(box.get('Width', 0)),
			
 
				+                'height': float(box.get('Height', 0)),
			
 
				+                'x': float(box.get('x', 0)),
			
 
				+                'y': float(box.get('y', 0))
			
 
				+            }
			
 
				+        return {'width': 0, 'height': 0, 'x': 0, 'y': 0}
			
 
				+
			
 
				+    def _extract_text_content(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
			
 
				+        """提取页面文本内容，包含位置和样式信息"""
			
 
				+        text_objects = root.findall('.//ofdtext:TextObject', ns)
			
 
				+        texts = []
			
 
				+
			
 
				+        for text_obj in text_objects:
			
 
				+            # 获取文本对象的基本属性
			
 
				+            text_info = {
			
 
				+                'id': text_obj.get('ID'),
			
 
				+                'bounding_box': {
			
 
				+                    'x': float(text_obj.get('BoundaryBox').split()[0]),
			
 
				+                    'y': float(text_obj.get('BoundaryBox').split()[1]),
			
 
				+                    'width': float(text_obj.get('BoundaryBox').split()[2]),
			
 
				+                    'height': float(text_obj.get('BoundaryBox').split()[3])
			
 
				+                },
			
 
				+                'transform': text_obj.get('CTM'),
			
 
				+                'content': []
			
 
				+            }
			
 
				+
			
 
				+            # 获取文本样式
			
 
				+            style = text_obj.find('ofdtext:TextStyle', ns)
			
 
				+            if style is not None:
			
 
				+                text_info['style'] = {
			
 
				+                    'font': style.get('Font'),
			
 
				+                    'size': float(style.get('Size', 0)),
			
 
				+                    'color': style.get('FillColor'),
			
 
				+                    'weight': style.get('Weight'),
			
 
				+                    'italic': style.get('Italic') == 'true',
			
 
				+                    'underline': style.get('Underline') == 'true',
			
 
				+                    'strikeout': style.get('StrikeOut') == 'true'
			
 
				+                }
			
 
				+
			
 
				+            # 提取实际文本内容
			
 
				+            text_codecs = text_obj.findall('ofdtext:TextCode', ns)
			
 
				+            for codec in text_codecs:
			
 
				+                if codec.text:
			
 
				+                    text_info['content'].append({
			
 
				+                        'text': codec.text.strip(),
			
 
				+                        'position': {
			
 
				+                            'x': float(codec.get('X', 0)),
			
 
				+                            'y': float(codec.get('Y', 0))
			
 
				+                        }
			
 
				+                    })
			
 
				+
			
 
				+            if text_info['content']:
			
 
				+                texts.append(text_info)
			
 
				+
			
 
				+        return texts
			
 
				+
			
 
				+    def _extract_images(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
			
 
				+        """提取页面中的图像信息"""
			
 
				+        images = []
			
 
				+        image_objects = root.findall('.//ofdimg:ImageObject', ns)
			
 
				+
			
 
				+        for img_obj in image_objects:
			
 
				+            image = {
			
 
				+                'id': img_obj.get('ID'),
			
 
				+                'bounding_box': {
			
 
				+                    'x': float(img_obj.get('BoundaryBox').split()[0]),
			
 
				+                    'y': float(img_obj.get('BoundaryBox').split()[1]),
			
 
				+                    'width': float(img_obj.get('BoundaryBox').split()[2]),
			
 
				+                    'height': float(img_obj.get('BoundaryBox').split()[3])
			
 
				+                },
			
 
				+                'resource_id': img_obj.get('ResourceID'),
			
 
				+                'transform': img_obj.get('CTM')
			
 
				+            }
			
 
				+            images.append(image)
			
 
				+
			
 
				+        return images
			
 
				+
			
 
				+    def _extract_graphics(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
			
 
				+        """提取页面中的图形信息"""
			
 
				+        graphics = []
			
 
				+        graphic_objects = root.findall('.//ofdgraph:PathObject', ns)
			
 
				+
			
 
				+        for graphic_obj in graphic_objects:
			
 
				+            graphic = {
			
 
				+                'id': graphic_obj.get('ID'),
			
 
				+                'bounding_box': {
			
 
				+                    'x': float(graphic_obj.get('BoundaryBox').split()[0]),
			
 
				+                    'y': float(graphic_obj.get('BoundaryBox').split()[1]),
			
 
				+                    'width': float(graphic_obj.get('BoundaryBox').split()[2]),
			
 
				+                    'height': float(graphic_obj.get('BoundaryBox').split()[3])
			
 
				+                },
			
 
				+                'fill_color': graphic_obj.get('FillColor'),
			
 
				+                'stroke_color': graphic_obj.get('StrokeColor'),
			
 
				+                'line_width': float(graphic_obj.get('LineWidth', 0)),
			
 
				+                'path_data': graphic_obj.find('ofdgraph:PathData', ns).text if graphic_obj.find('ofdgraph:PathData',
			
 
				+                                                                                                ns) is not None else ''
			
 
				+            }
			
 
				+            graphics.append(graphic)
			
 
				+
			
 
				+        return graphics
			
 
				+
			
 
				+    def _parse_layers(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
			
 
				+        """解析页面图层信息"""
			
 
				+        layers = []
			
 
				+        layer_nodes = root.findall('.//ofd:Layer', ns)
			
 
				+
			
 
				+        for layer in layer_nodes:
			
 
				+            layer_info = {
			
 
				+                'type': layer.get('Type'),
			
 
				+                'objects': {
			
 
				+                    'text': len(layer.findall('.//ofdtext:TextObject', ns)),
			
 
				+                    'images': len(layer.findall('.//ofdimg:ImageObject', ns)),
			
 
				+                    'graphics': len(layer.findall('.//ofdgraph:PathObject', ns))
			
 
				+                }
			
 
				+            }
			
 
				+            layers.append(layer_info)
			
 
				+
			
 
				+        return layers
			
 
				+
			
 
				+    def _cleanup(self) -> None:
			
 
				+        """清理临时文件"""
			
 
				+        import shutil
			
 
				+        # if self.temp_dir.exists():
			
 
				+        #     shutil.rmtree(self.temp_dir)
			
 
				+
			
 
				+
			
 
				+# 使用示例
			
 
				+if __name__ == "__main__":
			
 
				+    try:
			
 
				+        p = "C:/Users/Administrator/Downloads/1750060386706.ofd"
			
 
				+        parser = OFDParser(p)
			
 
				+        result = parser.parse()
			
 
				+
			
 
				+        # 打印文档基本信息
			
 
				+        print("文档信息:", result["file_info"])
			
 
				+
			
 
				+        # 打印所有页面的文本内容
			
 
				+        for doc_idx, document in enumerate(result["documents"], 1):
			
 
				+            print(f"\n文档 {doc_idx}:")
			
 
				+            print(f"  字体数量: {len(document['fonts'])}")
			
 
				+            print(f"  页面数量: {len(document['pages'])}")
			
 
				+
			
 
				+            # 打印文档元数据
			
 
				+            if document['metadata']:
			
 
				+                print("  元数据:")
			
 
				+                for key, value in document['metadata'].items():
			
 
				+                    print(f"    {key}: {value}")
			
 
				+
			
 
				+            # 打印页面内容摘要
			
 
				+            for page_idx, page in enumerate(document["pages"], 1):
			
 
				+                print(f"\n  页面 {page_idx}:")
			
 
				+                print(f"    尺寸: {page['content']['size']['width']} x {page['content']['size']['height']}")
			
 
				+                print(f"    文本元素: {len(page['content']['text_content'])}")
			
 
				+                print(f"    图像元素: {len(page['content']['images'])}")
			
 
				+                print(f"    图形元素: {len(page['content']['graphics'])}")
			
 
				+                print(f"    图层数量: {len(page['content']['layers'])}")
			
 
				+
			
 
				+                # 打印前5行文本
			
 
				+                if page['content']['text_content']:
			
 
				+                    print("    前5行文本:")
			
 
				+                    for i, text_elem in enumerate(page['content']['text_content'][:5]):
			
 
				+                        text_lines = " ".join([t['text'] for t in text_elem['content']])
			
 
				+                        print(f"      {i + 1}. {text_lines[:50]}{'...' if len(text_lines) > 50 else ''}")
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        print(f"解析OFD文件时出错: {e}")
			
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
@@ -9,13 +9,18 @@ import pickle
 
				 import socket
			
 
				 import subprocess
			
 
				 import sys
			
 
				+from glob import glob
			
 
				 from io import BytesIO
			
 
				 from subprocess import Popen
			
 
				+import pynvml
			
 
				+import datetime
			
 
				+import PyPDF2
			
 
				 from shapely.geometry import LineString
			
 
				 import cv2
			
 
				 import requests
			
 
				 from PIL import Image
			
 
				-
			
 
				+from reportlab.pdfbase import pdfmetrics
			
 
				+from reportlab.pdfbase.ttfonts import TTFont
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				 import difflib
			
 
				 import logging
			
@@ -43,6 +48,14 @@ from shapely.geometry import Polygon
 
				 
			
 
				 config_file_path = os.path.dirname(os.path.abspath(__file__)) + "/../config/interface_new.yml"
			
 
				 
			
 
				+# 特殊中文转基本中文
			
 
				+with open(os.path.abspath(os.path.dirname(__file__)) + '/font_map/extend_to_normal_dict.txt', 'r', encoding='utf-8') as f:
			
 
				+    extend_to_normal_dict = f.read()
			
 
				+    extend_to_normal_dict = eval(extend_to_normal_dict)
			
 
				+with open(os.path.abspath(os.path.dirname(__file__)) + '/font_map/kangxi_to_normal_dict.txt', 'r', encoding='utf-8') as f:
			
 
				+    kangxi_to_normal_dict = f.read()
			
 
				+    kangxi_to_normal_dict = eval(kangxi_to_normal_dict)
			
 
				+
			
 
				 
			
 
				 def has_intersection(poly1, poly2):
			
 
				     """
			
@@ -62,7 +75,7 @@ def has_intersection(poly1, poly2):
 
				 
			
 
				 
			
 
				 def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13,
			
 
				-                                  -14, -15, -16, -17, -18, -19, -20, -21, -22]):
			
 
				+                                  -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]):
			
 
				     """
			
 
				     [0] : continue
			
 
				     [-1]: 逻辑处理错误
			
@@ -87,6 +100,7 @@ def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -1
 
				     [-20]: requests请求超时
			
 
				     [-21]: requests请求返回错误状态码
			
 
				     [-22]: requests请求拒绝连接
			
 
				+    [-23]: 两列无边框表格提取报错
			
 
				     """
			
 
				     for c in code:
			
 
				         if isinstance(_list, list) and _list == [c]:
			
@@ -366,11 +380,45 @@ def slash_replace(_str, reverse=False):
 
				     return _str
			
 
				 
			
 
				 
			
 
				+def align_table_lines(line_list, threshold=7):
			
 
				+    """
			
 
				+    对齐横线竖线，包括越过合并单元格的线
			
 
				+    否则在生成表格时会因为线错位出错
			
 
				+
			
 
				+    :return:
			
 
				+    """
			
 
				+    rows = []
			
 
				+    cols = []
			
 
				+    for line in line_list:
			
 
				+        x0, y0, x1, y1 = line.bbox
			
 
				+        if abs(x0-x1) > abs(y0-y1):
			
 
				+            rows.append(line)
			
 
				+        else:
			
 
				+            cols.append(line)
			
 
				+    if not rows or not cols:
			
 
				+        return line_list
			
 
				+
			
 
				+    rows.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
			
 
				+    last_line = rows[0]
			
 
				+    for line in rows[1:]:
			
 
				+        if abs(line.bbox[1] - last_line.bbox[1]) <= threshold and line.bbox[1] != last_line.bbox[1]:
			
 
				+            last_line.bbox = (last_line.bbox[0], line.bbox[1], last_line.bbox[2], line.bbox[3])
			
 
				+        last_line = line
			
 
				+
			
 
				+    cols.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
			
 
				+    last_line = cols[0]
			
 
				+    for line in cols[1:]:
			
 
				+        if abs(line.bbox[0] - last_line.bbox[0]) <= threshold and line.bbox[0] != last_line.bbox[0]:
			
 
				+            last_line.bbox = (line.bbox[0], last_line.bbox[1], line.bbox[2], last_line.bbox[3])
			
 
				+        last_line = line
			
 
				+    line_list = rows + cols
			
 
				+    return line_list
			
 
				+
			
 
				+
			
 
				 class LineTable:
			
 
				     def recognize_table(self, list_textbox, list_line, sourceP_LB=False,
			
 
				                         splited=False, from_pdf=False, is_reverse=False, show=0):
			
 
				         self.list_line = list_line
			
 
				-        self.list_crosspoints = self.recognize_crosspoints(list_line)
			
 
				         self.from_pdf = from_pdf
			
 
				         self.splited = splited
			
 
				         self.connect_bbox_list = []
			
@@ -381,6 +429,13 @@ class LineTable:
 
				             # 展示原始表格及文字
			
 
				             self._plot(list_line, list_textbox, title='list_line,list_textbox')
			
 
				 
			
 
				+        list_line = align_table_lines(list_line)
			
 
				+        if self.show:
			
 
				+            self._plot(list_line, list_textbox, title='align_table_lines')
			
 
				+
			
 
				+        # 获取交点
			
 
				+        self.list_crosspoints = self.recognize_crosspoints(list_line)
			
 
				+
			
 
				         # 聚类
			
 
				         cluster_crosspoints = []
			
 
				         for _point in self.list_crosspoints:
			
@@ -1189,6 +1244,15 @@ class LineTable:
 
				 
			
 
				     def fix_rect(self, _table, list_x, list_y, sourceP_LB, margin):
			
 
				         self.fix_span(_table, list_x, list_y, sourceP_LB)
			
 
				+        if self.show:
			
 
				+            # 打印_table
			
 
				+            temp_list = []
			
 
				+            for t in _table:
			
 
				+                print('------ fix_span row ------')
			
 
				+                for c in t:
			
 
				+                    print('fix_span col', c)
			
 
				+                    temp_list.append(c)
			
 
				+            self._plot([], [], temp_list, title='fix_span table')
			
 
				 
			
 
				         for _line in _table:
			
 
				             _line.sort(key=lambda x: x.get('bbox')[0])
			
@@ -1646,7 +1710,7 @@ def sort_object(obj_list, is_reverse=False):
 
				     if len(obj_list) == 0:
			
 
				         return obj_list
			
 
				     if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
			
 
				-        obj_list.sort(key=lambda x: (x.y, x.x), reverse=is_reverse)
			
 
				+        obj_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]), reverse=is_reverse)
			
 
				         return obj_list
			
 
				     elif isinstance(obj_list[0], _Page):
			
 
				         obj_list.sort(key=lambda x: x.page_no)
			
@@ -2544,6 +2608,237 @@ def dynamic_get_port(start_port, mode='-1', num=10):
 
				     return None
			
 
				 
			
 
				 
			
 
				+def text_bbox_to_lt(text_list, bbox_list):
			
 
				+    from format_convert.convert_tree import TextBox
			
 
				+    lt_text_box_list = []
			
 
				+    for i in range(len(bbox_list)):
			
 
				+        bbox = bbox_list[i]
			
 
				+        b_text = text_list[i]
			
 
				+        lt_text_box_list.append(TextBox([bbox[0][0], bbox[0][1], bbox[2][0], bbox[2][1]], b_text))
			
 
				+    return lt_text_box_list
			
 
				+
			
 
				+
			
 
				+def extract_one_page_pdf(input_pdf_path, output_pdf_path, page_no):
			
 
				+    try:
			
 
				+        # 打开源PDF文件
			
 
				+        with open(input_pdf_path, 'rb') as input_file:
			
 
				+            pdf_reader = PyPDF2.PdfFileReader(input_file)
			
 
				+
			
 
				+            # 检查页码是否有效
			
 
				+            if page_no < 0 or page_no >= len(pdf_reader.pages):
			
 
				+                print("页码超出范围")
			
 
				+                return
			
 
				+
			
 
				+            # 创建一个新的PDF写入对象
			
 
				+            pdf_writer = PyPDF2.PdfFileWriter()
			
 
				+
			
 
				+            # 添加指定页到写入对象
			
 
				+            pdf_writer.addPage(pdf_reader.pages[page_no])
			
 
				+
			
 
				+            # 将新的PDF写入到输出文件
			
 
				+            with open(output_pdf_path, 'wb') as output_file:
			
 
				+                pdf_writer.write(output_file)
			
 
				+
			
 
				+        print(f"成功提取第 {page_no + 1} 页并保存为 {output_pdf_path}")
			
 
				+    except Exception as e:
			
 
				+        print(f"提取页面失败：{e}")
			
 
				+
			
 
				+
			
 
				+def get_gpu_memory_usage():
			
 
				+    try:
			
 
				+        # 初始化 NVML
			
 
				+        pynvml.nvmlInit()
			
 
				+        # 获取 GPU 设备数量
			
 
				+        device_count = pynvml.nvmlDeviceGetCount()
			
 
				+        # 获取当前时间
			
 
				+        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
			
 
				+
			
 
				+        # 遍历每个 GPU
			
 
				+        for i in range(device_count):
			
 
				+            # 获取 GPU 句柄
			
 
				+            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
			
 
				+
			
 
				+            # 获取 GPU 名称
			
 
				+            gpu_name = pynvml.nvmlDeviceGetName(handle)
			
 
				+
			
 
				+            # 获取显存信息
			
 
				+            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
			
 
				+            total_memory = mem_info.total / (1024 * 1024)  # 转换为 MiB
			
 
				+            used_memory = mem_info.used / (1024 * 1024)   # 转换为 MiB
			
 
				+            free_memory = mem_info.free / (1024 * 1024)   # 转换为 MiB
			
 
				+
			
 
				+            info = f'  时间：{now}\n'
			
 
				+            info += f"  GPU信息 {i}: {gpu_name.decode('utf-8')}\n"
			
 
				+            info += f"    总显存: {total_memory:.2f} MiB\n"
			
 
				+            info += f"    已用显存: {used_memory:.2f} MiB\n"
			
 
				+            info += f"    剩余显存: {free_memory:.2f} MiB\n\n"
			
 
				+
			
 
				+            # 获取进程信息
			
 
				+            processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
			
 
				+            if processes:
			
 
				+                info += f"  GPU进程信息: {i}\n"
			
 
				+                for p in processes:
			
 
				+                    pid = p.pid
			
 
				+                    used_memory = p.usedGpuMemory / (1024 * 1024)
			
 
				+                    try:
			
 
				+                        # 获取进程的启动命令
			
 
				+                        proc = psutil.Process(pid)
			
 
				+                        cmdline = proc.cmdline()
			
 
				+                        info += f"    {' '.join(cmdline)[-17:-14]} {pid}: {used_memory:.2f} MiB\n"
			
 
				+                    except:
			
 
				+                        traceback.print_exc()
			
 
				+            print(info)
			
 
				+
			
 
				+        # 关闭 NVML
			
 
				+        pynvml.nvmlShutdown()
			
 
				+    except:
			
 
				+        traceback.print_exc()
			
 
				+        pass
			
 
				+
			
 
				+
			
 
				+def get_current_process_gpu_id():
			
 
				+    try:
			
 
				+        # 初始化 NVML
			
 
				+        pynvml.nvmlInit()
			
 
				+
			
 
				+        # 获取当前进程的 PID
			
 
				+        current_pid = os.getpid()
			
 
				+        # print(f"Current PID: {current_pid}")
			
 
				+
			
 
				+        # 获取 GPU 设备数量
			
 
				+        device_count = pynvml.nvmlDeviceGetCount()
			
 
				+
			
 
				+        # 遍历每个 GPU 设备
			
 
				+        for i in range(device_count):
			
 
				+            # 获取 GPU 句柄
			
 
				+            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
			
 
				+
			
 
				+            # 获取运行在该 GPU 上的进程
			
 
				+            try:
			
 
				+                processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
			
 
				+            except pynvml.NVMLError:
			
 
				+                processes = []
			
 
				+
			
 
				+            # 查找当前进程
			
 
				+            for p in processes:
			
 
				+                if p.pid == current_pid:
			
 
				+                    print(f"Process {current_pid} is running on GPU {i}")
			
 
				+                    return i
			
 
				+
			
 
				+        print("Current process not found on any GPU")
			
 
				+        return None
			
 
				+    except:
			
 
				+        traceback.print_exc()
			
 
				+        return None
			
 
				+    finally:
			
 
				+        # 关闭 NVML
			
 
				+        pynvml.nvmlShutdown()
			
 
				+
			
 
				+
			
 
				+def register_all_fonts(font_dir):
			
 
				+    # 遍历字体目录
			
 
				+    for root, dirs, files in os.walk(font_dir):
			
 
				+        for file in files:
			
 
				+            # 检查文件扩展名是否为 TrueType 或 OpenType
			
 
				+            if file.endswith((".ttf", ".otf")):
			
 
				+                font_path = os.path.join(root, file)
			
 
				+                # 提取字体名称（去掉扩展名）
			
 
				+                font_name = os.path.splitext(file)[0]
			
 
				+                try:
			
 
				+                    # 注册字体
			
 
				+                    pdfmetrics.registerFont(TTFont(font_name, font_path))
			
 
				+                    print(f"Font registered: {font_name}")
			
 
				+                except Exception as e:
			
 
				+                    print(f"Failed to register font {font_name}: {e}")
			
 
				+
			
 
				+
			
 
				+def ascii85_decode(data):
			
 
				+    """
			
 
				+    手动实现 ASCII85 解码
			
 
				+    """
			
 
				+    decoded = b''
			
 
				+    i = 0
			
 
				+    while i < len(data):
			
 
				+        # ASCII85 编码以 '!' 开始，以 'z' 结束
			
 
				+        if data[i] == ord('z'):
			
 
				+            decoded += b'\0\0\0\0'
			
 
				+            i += 1
			
 
				+        else:
			
 
				+            # 取 5 个字符进行解码
			
 
				+            block = data[i:i+5]
			
 
				+            i += 5
			
 
				+            # 转换为整数值
			
 
				+            value = 0
			
 
				+            for c in block:
			
 
				+                if ord('!') <= c <= ord('u'):
			
 
				+                    value = value * 85 + (c - ord('!'))
			
 
				+                elif c == ord('z'):
			
 
				+                    value = 0
			
 
				+                else:
			
 
				+                    # 无效字符，跳过
			
 
				+                    continue
			
 
				+            # 转换为 4 个字节
			
 
				+            bytes_value = value.to_bytes(4, byteorder='big')
			
 
				+            decoded += bytes_value
			
 
				+    return decoded
			
 
				+
			
 
				+
			
 
				+def special_font_to_normal(text):
			
 
				+    """
			
 
				+    特殊中文转基本中文unicode
			
 
				+
			
 
				+    :return:
			
 
				+    """
			
 
				+    # print('type(extend_to_normal_dict)', type(extend_to_normal_dict), type(kangxi_to_normal_dict))
			
 
				+    extend_set = set(extend_to_normal_dict.keys())
			
 
				+    kangxi_set = set(kangxi_to_normal_dict.keys())
			
 
				+    text_list = list(text)
			
 
				+    for i, c in enumerate(text_list):
			
 
				+        if c in extend_set:
			
 
				+            text_list[i] = extend_to_normal_dict.get(c)
			
 
				+        elif c in kangxi_set:
			
 
				+            text_list[i] = kangxi_to_normal_dict.get(c)
			
 
				+    text = ''.join(text_list)
			
 
				+    return text
			
 
				+
			
 
				+
			
 
				+def image_resize_by_ratio(img, max_width=1800, max_height=2600):
			
 
				+    # 获取原图的宽度和高度
			
 
				+    width, height = img.size
			
 
				+    # print('width, height, max_width, max_height', width, height, max_width, max_height)
			
 
				+
			
 
				+    # 计算宽高比
			
 
				+    aspect_ratio = width / height
			
 
				+    # 判断哪条边超出最大值更多
			
 
				+    if width > max_width and height > max_height:
			
 
				+        # 计算宽度和高度超出最大值的比例
			
 
				+        width_exceed_ratio = width / max_width
			
 
				+        height_exceed_ratio = height / max_height
			
 
				+
			
 
				+        # 选择超出比例更大的边作为基准进行缩放
			
 
				+        if width_exceed_ratio > height_exceed_ratio:
			
 
				+            new_width = max_width
			
 
				+            new_height = int(new_width / aspect_ratio)
			
 
				+        else:
			
 
				+            new_height = max_height
			
 
				+            new_width = int(new_height * aspect_ratio)
			
 
				+        # print('new_width, new_height1', new_width, new_height)
			
 
				+    elif width > max_width:
			
 
				+        new_width = max_width
			
 
				+        new_height = int(new_width / aspect_ratio)
			
 
				+        # print('new_width, new_height2', new_width, new_height)
			
 
				+    elif height > max_height:
			
 
				+        new_height = max_height
			
 
				+        new_width = int(new_height * aspect_ratio)
			
 
				+        # print('new_width, new_height3', new_width, new_height)
			
 
				+    else:
			
 
				+        new_width, new_height = width, height
			
 
				+
			
 
				+    if new_width != width or new_height != height:
			
 
				+        img = img.resize((new_width, new_height), Image.LANCZOS)
			
 
				+    return img
			
 
				+
			
 
				+
			
 
				 if __name__ == "__main__":
			
 
				     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
			
 
				     # print(slash_replace(strs))
			
@@ -2572,14 +2867,27 @@ if __name__ == "__main__":
 
				 
			
 
				     # print(parse_yaml())
			
 
				 
			
 
				-    print(get_ip_port())
			
 
				+    # print(get_ip_port())
			
 
				     # set_flask_global()
			
 
				-    print(get_all_ip())
			
 
				-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
			
 
				-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
			
 
				-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "ocr"))
			
 
				-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], 'convert', 'MASTER'))
			
 
				+    # print(get_all_ip())
			
 
				+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
			
 
				+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
			
 
				+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "ocr"))
			
 
				+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], 'convert', 'MASTER'))
			
 
				     # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
			
 
				     # print(get_intranet_ip())
			
 
				-    # _path = "C:/Users/Administrator/Downloads/3.png"
			
 
				-    # remove_red_seal(cv2.imread(_path))
			
 
				+
			
 
				+    # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_pdf\*.pdf')
			
 
				+    # save_dir = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf'
			
 
				+    # index = 0
			
 
				+    # for p in ps:
			
 
				+    #     save_path = f'{save_dir}/e-{index}.pdf'
			
 
				+    #     page_no = int(re.split('\.|-', p)[1])
			
 
				+    #     extract_one_page_pdf(p, save_path, page_no)
			
 
				+    #     index += 1
			
 
				+
			
 
				+    # _ss = 'otr_interface:app'
			
 
				+    # print(_ss[-17:-14])
			
 
				+
			
 
				+    _ss = '仁和坪镇杨柳池村⼈居环境整治项⽬终⽌'
			
 
				+    print(special_font_to_normal(_ss))
			
--- a/monitor/watch_10_minutes_process.sh
+++ b/monitor/watch_10_minutes_process.sh
@@ -1,3 +1,6 @@
 
				 #!/bin/bash
			
 
				 
			
 
				 sed -n '/2024-05-29 17:30:00/,/2024-05-29 17:40:00/p' /convert.out | grep 'is_success' | wc -l
			
 
				+
			
 
				+
			
 
				+sed -n '/2025-06-11 12:50:00/,/2025-06-11 13:00:00/p' /convert.out | grep 'is_success: ' | awk -F '[\\[\\] ]+' '{file_type=$(NF-2); time=$NF; map[file_type] += time; count[file_type]++} END {for (key in map) print key, "-", map[key], "-", count[key], "-", map[key]/count[key]}'
			
--- a/ocr/ocr_interface.py
+++ b/ocr/ocr_interface.py
@@ -5,7 +5,7 @@ import multiprocessing as mp
 
				 import socket
			
 
				 import sys
			
 
				 import os
			
 
				-
			
 
				+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
			
 
				 from PIL import Image
			
 
				 
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
@@ -91,7 +91,10 @@ def picture2text(img_data, ocr_model, only_rec=0):
 
				         text_list = []
			
 
				         bbox_list = []
			
 
				         if only_rec:
			
 
				-            text_list = [results[0][0]]
			
 
				+            if results:
			
 
				+                text_list = [results[0][0]]
			
 
				+            else:
			
 
				+                text_list = []
			
 
				             bbox_list = []
			
 
				         else:
			
 
				             for line in results:
			
@@ -176,27 +179,6 @@ def test_ocr_model(from_remote=True):
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    test_ocr_model()
			
 
				-
			
 
				-#     src = """
			
 
				-# data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASwAAAAeCAYAAACWuCNnAAAE3ElEQVR42u2dQWjUQBSGi4gnEURERKQgIiIiggdP4sWDiPTgvSAK4sFDEe8iIohHEW8iUsSLFBERQaSI9CBIERGRgpQiHtpuMkl2Pa/vH2fW2ZhssttkO8v+AyHTzCT53nT23zdvZpOJIAh2TEhqNpu7VaS+THiWWq3WHuFaQV7279bW1rarWP2y5eRnYv8ZI36l1PF2u71N52P11V7Ap+QaKbwPGo3GPudv8jOx/4wLv5xw2+ajKDrhY4cT5Z23xkKJhfOG5KeSJDlEfib2H/L7Y2ysHotRp62xsp8J4/Bc2ljyM7H/kN+LFEbhGxipjU3C86LYiRy7R34m9h/ye2s0DJZtEsaK0aujFO8ZdX72H/Kz/zup3W5vsfmgGRyJ4/hkug7cSBnn3nHrkZ+J/Yf8/8agkZoXRfyOPKYoaxnnRuqHbMti5EH9N1Q4DE91KXQYHnWNJf+IxJDY/uQfFj8qQzn1tCSglLpfm8v4d2y7omJ1E4bLvWa7WKQRBjGW/JsrVmx/8lfOf/HSlbatZPO4gVTeKhUjZxx6FdOR6XPSKa8s77hc8wIM1flYfcD4VrYW6kOpJf9etre97plxzZ78lX4wc/jlG+WZy9/nNYfGX4tYsf3JXwP/f2KCfd4GY+H6bUSwsrb13+t7jaLOWaMxBkZZEAT7scfisrKChelSaaxvmE6F2ouhd8M4vObyV5ny+PHNAH6U28VxPvLX0IHZ/uSvhb8jIlZQ5GbTUvmyzkdqCSeZ/CrGoQBBsAzHAFRWsNz7ZNWT63+Se10P4uCsLbOBO5ev0FjDb66p+c0Yu4sf5Xn8Aw5/uvjtP63v62wSf2XtwPYnf438mQJjhOsJ9nLya5xkXb8sDynPe8oSpqxyd02G68FJfjHPKytovDnwgzvNL2WfcQ/TGAuVjN8z+E1+MUmSXQP884fKX0Psiu1P/lr482NYcpJUPuMMB/VJ5keKjwYZEmYNPdPlcAVdYXLXavQVwzL8thHT/HV9WMHf+W3UBtaabBZ/laLF9id/1fyd9RO9YlcG5LVZYj/Vy9MZ9LgrZJi+xB7rNoqE0F3/Iar8Aq5pl1IbxXf5G0njcI3xmwU7/Zrmz0q+8feb2P7kHyY/Mk/hHloxgAvnBshc7wqPh8BUY1pgsrynokB7nujBjRSmnWUmAYwhmh8zEln82ugUf82ehebv4xvJK/4BvlHZ/uQfGn/noIlbvbRDMkwpOrGkn0Y9Z/MC4Hkxq6LydF3X4ys71HRmJzQ/HlNh+U15h1/KDwzL4+jH/fWFf1D3ne1P/mHw60pZImIj9WYt1LH0MvpeolM09CsTQC8rWJZfjProuKVTll8HBTP4vfmwk5/85C/PLwUPpcItu/zdDAFnigLcvQRlIx5WmetjOhZKnMWvV9wKv88BafKTn/wb4O9aHBqraayhQMCsKLheVmjq8LDMbMcS8piJ0NO1WBjn8Lvupm+J/OQn/wD8GC+aRz9Muk/8w0mIzPuq0pia1UaTn/zkHx9+85D45bxAmGdGzpOf/OQfY34peI7f+kDpfDQWv0XSY1tRYROQe0V+8pN/TPn5miPyk5/8I8PP1xyRn/zkHxl+viaI/OQnP/mrMpavOSI/+ck/SomvOSI/+ck/cqLF1xyRn/zk9yLxNUfkJz/5i9If9M5atZCy5xcAAAAASUVORK5CYII=
			
 
				-# """
			
 
				-#
			
 
				-#     image_data = src.split('data:image/png;base64,')[1]
			
 
				-#
			
 
				-#     # 解码 base64 字符串
			
 
				-#     image_bytes = base64.b64decode(image_data)
			
 
				-#
			
 
				-#     # 将字节转换为图像
			
 
				-#     # image = Image.open(io.BytesIO(image_bytes))
			
 
				-#
			
 
				-#     # image.show('img')
			
 
				-#
			
 
				-#     # with open(r'C:\Users\Administrator\Desktop\test_image\error16.jpg', 'rb') as f:
			
 
				-#     #     image_bytes = f.read()
			
 
				-#
			
 
				-#     image = bytes2np(image_bytes)
			
 
				-#
			
 
				-#     cv2.imshow('img', image)
			
 
				-#     cv2.imwrite('./1.png', image)
			
 
				-#     cv2.waitKey(0)
			
 
				+    # test_ocr_model()
			
 
				+
			
 
				+    app.run(host='127.0.0.1', port=17000, debug=False)
			
--- a/ocr/ppocr/data/__init__.py
+++ b/ocr/ppocr/data/__init__.py
@@ -25,6 +25,9 @@ import signal
 
				 import random
			
 
				 
			
 
				 __dir__ = os.path.dirname(os.path.abspath(__file__))
			
 
				+
			
 
				+from format_convert.utils import get_platform
			
 
				+
			
 
				 sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
			
 
				 
			
 
				 import copy
			
@@ -49,8 +52,9 @@ def term_mp(sig_num, frame):
 
				     os.killpg(pgid, signal.SIGKILL)
			
 
				 
			
 
				 
			
 
				-signal.signal(signal.SIGINT, term_mp)
			
 
				-signal.signal(signal.SIGTERM, term_mp)
			
 
				+if get_platform() != 'Windows':
			
 
				+    signal.signal(signal.SIGINT, term_mp)
			
 
				+    signal.signal(signal.SIGTERM, term_mp)
			
 
				 
			
 
				 
			
 
				 def build_dataloader(config, mode, device, logger, seed=None):
			
--- a/ocr/test_lock.py
+++ b/ocr/test_lock.py
@@ -0,0 +1,39 @@
 
				+import multiprocessing
			
 
				+import os
			
 
				+import sys
			
 
				+import time
			
 
				+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + '/../')
			
 
				+from format_convert.utils import file_lock
			
 
				+
			
 
				+
			
 
				+def run(a):
			
 
				+    while True:
			
 
				+        try:
			
 
				+            time2 = time.time()
			
 
				+            lock_file_sub = 'ocr'
			
 
				+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
			
 
				+            f = file_lock(lock_file)
			
 
				+            print(os.getpid(),"get file_lock " + lock_file + " time ", time.time()-time2)
			
 
				+            time2 = time.time()
			
 
				+            time.sleep(2)
			
 
				+            raise
			
 
				+            print(os.getpid(), "sleep", time.time()-time2)
			
 
				+
			
 
				+
			
 
				+        except Exception:
			
 
				+            print('RuntimeError')
			
 
				+        finally:
			
 
				+            f.close()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    # 要处理的数据
			
 
				+    data = [1, 2, 3]
			
 
				+
			
 
				+    # 创建进程池，指定进程数为 CPU 核心数
			
 
				+    with multiprocessing.Pool(processes=3) as pool:
			
 
				+        # 使用 map 方法分配任务并获取结果
			
 
				+        results = pool.map(run, data)
			
 
				+
			
 
				+    # 输出结果
			
 
				+    # print(results)
			
--- a/ocr/tools/infer/predict_det_pytorch.py
+++ b/ocr/tools/infer/predict_det_pytorch.py
@@ -19,7 +19,8 @@ import sys
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../")
			
 
				 import requests
			
 
				 from format_convert import _global
			
 
				-from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock
			
 
				+from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock, \
			
 
				+    get_gpu_memory_usage, get_current_process_gpu_id
			
 
				 
			
 
				 os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
			
 
				 import cv2
			
@@ -120,6 +121,11 @@ class TextDetector(object):
 
				         self.predictor.to(self.device)
			
 
				         self.predictor.eval()
			
 
				 
			
 
				+        if str(self.device) != 'cpu':
			
 
				+            self.gpu_id = get_current_process_gpu_id()
			
 
				+        else:
			
 
				+            self.gpu_id = None
			
 
				+
			
 
				         # self.predictor, self.input_tensor, self.output_tensors = utility.create_predictor(
			
 
				         #     args, 'det', logger)  # paddle.jit.load(args.det_model_dir)
			
 
				         # self.predictor.eval()
			
@@ -189,55 +195,44 @@ class TextDetector(object):
 
				         shape_list = np.expand_dims(shape_list, axis=0)
			
 
				         img = img.copy()
			
 
				         starttime = time.time()
			
 
				-
			
 
				+        tensor = torch.from_numpy(img).float()
			
 
				         # self.input_tensor.copy_from_cpu(img)
			
 
				-        img = torch.from_numpy(img).float()
			
 
				-        img = img.to(self.device)
			
 
				-        try:
			
 
				+        # if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not MAX_COMPUTE:
			
 
				+        if get_platform() != "Windows" and not MAX_COMPUTE and self.gpu_id is not None:
			
 
				             # 加锁，防止太多大图片同时预测，爆显存
			
 
				-            if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not MAX_COMPUTE:
			
 
				+            time2 = time.time()
			
 
				+            lock_file_sub = f'ocr_{self.gpu_id}'
			
 
				+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
			
 
				+            f = file_lock(lock_file)
			
 
				+            log("det get file_lock " + lock_file + " time " + str(time.time()-time2))
			
 
				+
			
 
				+            try:
			
 
				                 time2 = time.time()
			
 
				-                lock_file_sub = 'ocr'
			
 
				-                lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
			
 
				-                f = file_lock(lock_file)
			
 
				-                log("get file_lock " + lock_file_sub + " time " + str(time.time()-time2))
			
 
				+                if str(self.device) != 'cpu':
			
 
				+                    torch.cuda.empty_cache()
			
 
				+                tensor = tensor.to(self.device)
			
 
				                 with torch.no_grad():
			
 
				-                    out = self.predictor(img)
			
 
				+                    out = self.predictor(tensor)
			
 
				+                log("get file_lock run det" + " time " + str(time.time()-time2))
			
 
				+            except RuntimeError:
			
 
				+                log("ocr/tools/infer/predict_det.py predict.run error! maybe no gpu memory!")
			
 
				+                log("det predictor shrink memory! ori_im.shape " + str(ori_im.shape))
			
 
				+                get_gpu_memory_usage()
			
 
				+                raise RuntimeError
			
 
				+            finally:
			
 
				                 f.close()
			
 
				-            else:
			
 
				-                with torch.no_grad():
			
 
				-                    out = self.predictor(img)
			
 
				-        except RuntimeError:
			
 
				-            log("ocr/tools/infer/predict_det.py predict.run error! maybe no gpu memory!")
			
 
				-            log("predictor shrink memory!")
			
 
				-            # self.predictor.clear_intermediate_tensor()
			
 
				-            # self.predictor.try_shrink_memory()
			
 
				-            if str(self.device)!='cpu':
			
 
				-                torch.cuda.empty_cache()
			
 
				-                gc.collect()
			
 
				-            raise RuntimeError
			
 
				-
			
 
				-        # outputs = []
			
 
				-        # for output_tensor in self.output_tensors:
			
 
				-        #     output = output_tensor.copy_to_cpu()
			
 
				-        #     outputs.append(output)
			
 
				-        out = out.cpu().numpy()
			
 
				+                if str(self.device) != 'cpu':
			
 
				+                    torch.cuda.empty_cache()
			
 
				+                # gc.collect()
			
 
				+        else:
			
 
				+            tensor = tensor.to(self.device)
			
 
				+            with torch.no_grad():
			
 
				+                out = self.predictor(tensor)
			
 
				 
			
 
				+        out = out.cpu().numpy()
			
 
				         preds = {}
			
 
				         preds['maps'] = out
			
 
				 
			
 
				-        # if self.det_algorithm == "EAST":
			
 
				-        #     preds['f_geo'] = outputs[0]
			
 
				-        #     preds['f_score'] = outputs[1]
			
 
				-        # elif self.det_algorithm == 'SAST':
			
 
				-        #     preds['f_border'] = outputs[0]
			
 
				-        #     preds['f_score'] = outputs[1]
			
 
				-        #     preds['f_tco'] = outputs[2]
			
 
				-        #     preds['f_tvo'] = outputs[3]
			
 
				-        # elif self.det_algorithm == 'DB':
			
 
				-        #     preds['maps'] = outputs[0]
			
 
				-        # else:
			
 
				-        #     raise NotImplementedError
			
 
				         post_result = self.postprocess_op(preds, shape_list)
			
 
				         dt_boxes = post_result[0]['points']
			
 
				         if self.det_algorithm == "SAST" and self.det_sast_polygon:
			
@@ -246,17 +241,6 @@ class TextDetector(object):
 
				             dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
			
 
				         elapse = time.time() - starttime
			
 
				 
			
 
				-        # 释放内存
			
 
				-        # print("TextDetector", self.predictor)
			
 
				-        # if TextDetector.shrink_memory_count % 100 == 0:
			
 
				-            # print("TextDetector shrink memory")
			
 
				-        # self.predictor.clear_intermediate_tensor()
			
 
				-        # self.predictor.try_shrink_memory()
			
 
				-        # TextDetector.shrink_memory_count += 1
			
 
				-        if str(self.device) != 'cpu':
			
 
				-            torch.cuda.empty_cache()
			
 
				-            # gc.collect()
			
 
				-
			
 
				         return dt_boxes, elapse
			
 
				 
			
 
				 
			
--- a/ocr/tools/infer/predict_rec_pytorch.py
+++ b/ocr/tools/infer/predict_rec_pytorch.py
@@ -37,8 +37,9 @@ import ocr.tools.infer.utility as utility
 
				 from ocr.ppocr.postprocess import build_post_process
			
 
				 from ocr.ppocr.utils.logging import get_logger
			
 
				 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
			
 
				-
			
 
				-from format_convert.utils import judge_error_code, log, namespace_to_dict,get_platform
			
 
				+from config.max_compute_config import MAX_COMPUTE
			
 
				+from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock, \
			
 
				+    get_gpu_memory_usage, get_current_process_gpu_id
			
 
				 from format_convert import _global
			
 
				 
			
 
				 import torch
			
@@ -56,6 +57,8 @@ class TextRecognizer(object):
 
				         self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
			
 
				         self.character_type = args.rec_char_type
			
 
				         self.rec_batch_num = args.rec_batch_num
			
 
				+        self.rec_batch_num = 16
			
 
				+        print('self.rec_batch_num', self.rec_batch_num)
			
 
				         self.rec_algorithm = args.rec_algorithm
			
 
				         postprocess_params = {
			
 
				             'name': 'CTCLabelDecode',
			
@@ -64,23 +67,7 @@ class TextRecognizer(object):
 
				             # "use_space_char": args.use_space_char
			
 
				             "use_space_char": False
			
 
				         }
			
 
				-        # if self.rec_algorithm == "SRN":
			
 
				-        #     postprocess_params = {
			
 
				-        #         'name': 'SRNLabelDecode',
			
 
				-        #         "character_type": args.rec_char_type,
			
 
				-        #         "character_dict_path": args.rec_char_dict_path,
			
 
				-        #         "use_space_char": args.use_space_char
			
 
				-        #     }
			
 
				-        # elif self.rec_algorithm == "RARE":
			
 
				-        #     postprocess_params = {
			
 
				-        #         'name': 'AttnLabelDecode',
			
 
				-        #         "character_type": args.rec_char_type,
			
 
				-        #         "character_dict_path": args.rec_char_dict_path,
			
 
				-        #         "use_space_char": args.use_space_char
			
 
				-        #     }
			
 
				         self.postprocess_op = build_post_process(postprocess_params)
			
 
				-        # self.predictor, self.input_tensor, self.output_tensors = \
			
 
				-        #     utility.create_predictor(args, 'rec', logger)
			
 
				 
			
 
				         rec_model_path = args.rec_model_dir
			
 
				         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
			
@@ -100,19 +87,22 @@ class TextRecognizer(object):
 
				         self.predictor.to(self.device)
			
 
				         self.predictor.eval()
			
 
				 
			
 
				+        if str(self.device) != 'cpu':
			
 
				+            self.gpu_id = get_current_process_gpu_id()
			
 
				+        else:
			
 
				+            self.gpu_id = None
			
 
				+
			
 
				     def resize_norm_img(self, img, max_wh_ratio):
			
 
				         h, w = img.shape[:2]
			
 
				         imgC, imgH, imgW = self.rec_image_shape
			
 
				         assert imgC == img.shape[2]
			
 
				         # print('max_wh_ratio', max_wh_ratio)
			
 
				+        # max_wh_ratio h是w的10倍，直接返回
			
 
				         if max_wh_ratio < 0.1:
			
 
				-            # if h > imgW:
			
 
				-            #     resized_image = cv2.resize(img, (w, imgW))
			
 
				-            # else:
			
 
				-            #     resized_image = img
			
 
				-
			
 
				-            # max_wh_ratio h是w的10倍，直接跳过
			
 
				-            resized_w = None
			
 
				+            # log('max_wh_ratio < 0.1', )
			
 
				+            resized_image = img.astype('float32')
			
 
				+            resized_image = resized_image.transpose((2, 0, 1)) / 255
			
 
				+            return resized_image
			
 
				         else:
			
 
				             if self.character_type == "ch":
			
 
				                 imgW = int((32 * max_wh_ratio))
			
@@ -138,186 +128,211 @@ class TextRecognizer(object):
 
				             padding_im[:, :, 0:resized_w] = resized_image
			
 
				         return padding_im
			
 
				 
			
 
				-    def resize_norm_img_srn(self, img, image_shape):
			
 
				-        imgC, imgH, imgW = image_shape
			
 
				-
			
 
				-        img_black = np.zeros((imgH, imgW))
			
 
				-        im_hei = img.shape[0]
			
 
				-        im_wid = img.shape[1]
			
 
				-
			
 
				-        if im_wid <= im_hei * 1:
			
 
				-            img_new = cv2.resize(img, (imgH * 1, imgH))
			
 
				-        elif im_wid <= im_hei * 2:
			
 
				-            img_new = cv2.resize(img, (imgH * 2, imgH))
			
 
				-        elif im_wid <= im_hei * 3:
			
 
				-            img_new = cv2.resize(img, (imgH * 3, imgH))
			
 
				+    def predict(self, norm_img_batch):
			
 
				+        tensor = torch.from_numpy(norm_img_batch).float()
			
 
				+        # if norm_img.shape[3] >= 100 and get_platform() != "Windows" and not MAX_COMPUTE:
			
 
				+        if get_platform() != "Windows" and not MAX_COMPUTE:
			
 
				+            # 加锁
			
 
				+            time2 = time.time()
			
 
				+            lock_file_sub = 'ocr'
			
 
				+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
			
 
				+            f = file_lock(lock_file)
			
 
				+            log("rec get file_lock " + lock_file + " time " + str(time.time()-time2))
			
 
				+            try:
			
 
				+                time2 = time.time()
			
 
				+                if str(self.device) != 'cpu':
			
 
				+                    torch.cuda.empty_cache()
			
 
				+                tensor = tensor.to(self.device)
			
 
				+                with torch.no_grad():
			
 
				+                    out = self.predictor(tensor)
			
 
				+                log("get file_lock run rec" + " time " + str(time.time()-time2))
			
 
				+            except RuntimeError:
			
 
				+                log("ocr/tools/infer/predict_rec.py predict.run error! maybe no gpu memory!")
			
 
				+                log("rec predictor shrink memory! ori_im.shape " + str(norm_img_batch.shape))
			
 
				+                get_gpu_memory_usage()
			
 
				+                raise RuntimeError
			
 
				+            finally:
			
 
				+                f.close()
			
 
				+                if str(self.device) != 'cpu':
			
 
				+                    torch.cuda.empty_cache()
			
 
				+                gc.collect()
			
 
				         else:
			
 
				-            img_new = cv2.resize(img, (imgW, imgH))
			
 
				-
			
 
				-        img_np = np.asarray(img_new)
			
 
				-        img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
			
 
				-        img_black[:, 0:img_np.shape[1]] = img_np
			
 
				-        img_black = img_black[:, :, np.newaxis]
			
 
				-
			
 
				-        row, col, c = img_black.shape
			
 
				-        c = 1
			
 
				-
			
 
				-        return np.reshape(img_black, (c, row, col)).astype(np.float32)
			
 
				-
			
 
				-    def srn_other_inputs(self, image_shape, num_heads, max_text_length):
			
 
				-
			
 
				-        imgC, imgH, imgW = image_shape
			
 
				-        feature_dim = int((imgH / 8) * (imgW / 8))
			
 
				-
			
 
				-        encoder_word_pos = np.array(range(0, feature_dim)).reshape(
			
 
				-            (feature_dim, 1)).astype('int64')
			
 
				-        gsrm_word_pos = np.array(range(0, max_text_length)).reshape(
			
 
				-            (max_text_length, 1)).astype('int64')
			
 
				-
			
 
				-        gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
			
 
				-        gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
			
 
				-            [-1, 1, max_text_length, max_text_length])
			
 
				-        gsrm_slf_attn_bias1 = np.tile(
			
 
				-            gsrm_slf_attn_bias1,
			
 
				-            [1, num_heads, 1, 1]).astype('float32') * [-1e9]
			
 
				-
			
 
				-        gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
			
 
				-            [-1, 1, max_text_length, max_text_length])
			
 
				-        gsrm_slf_attn_bias2 = np.tile(
			
 
				-            gsrm_slf_attn_bias2,
			
 
				-            [1, num_heads, 1, 1]).astype('float32') * [-1e9]
			
 
				-
			
 
				-        encoder_word_pos = encoder_word_pos[np.newaxis, :]
			
 
				-        gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
			
 
				-
			
 
				-        return [
			
 
				-            encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
			
 
				-            gsrm_slf_attn_bias2
			
 
				-        ]
			
 
				-
			
 
				-    def process_image_srn(self, img, image_shape, num_heads, max_text_length):
			
 
				-        norm_img = self.resize_norm_img_srn(img, image_shape)
			
 
				-        norm_img = norm_img[np.newaxis, :]
			
 
				-
			
 
				-        [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
			
 
				-            self.srn_other_inputs(image_shape, num_heads, max_text_length)
			
 
				-
			
 
				-        gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
			
 
				-        gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
			
 
				-        encoder_word_pos = encoder_word_pos.astype(np.int64)
			
 
				-        gsrm_word_pos = gsrm_word_pos.astype(np.int64)
			
 
				-
			
 
				-        return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
			
 
				-                gsrm_slf_attn_bias2)
			
 
				+            tensor = tensor.to(self.device)
			
 
				+            with torch.no_grad():
			
 
				+                out = self.predictor(tensor)
			
 
				+        # logging.info("ocr model predict time - rec" + str(time.time()-start_time))
			
 
				+        out = out.cpu().numpy()
			
 
				+        preds = out
			
 
				+        return preds
			
 
				+
			
 
				+    def predict_batch(self, batch_list):
			
 
				+        batch_out_list = []
			
 
				+        if get_platform() != "Windows" and not MAX_COMPUTE and self.gpu_id is not None:
			
 
				+            # 加锁
			
 
				+            time2 = time.time()
			
 
				+            lock_file_sub = f'ocr_{self.gpu_id}'
			
 
				+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
			
 
				+            f = file_lock(lock_file)
			
 
				+            log("rec get file_lock " + lock_file + " time " + str(time.time()-time2))
			
 
				+            try:
			
 
				+                time2 = time.time()
			
 
				+                if str(self.device) != 'cpu':
			
 
				+                    torch.cuda.empty_cache()
			
 
				+                for sub_batch_list in batch_list:
			
 
				+                    sub_batch_out = []
			
 
				+                    for tensor in sub_batch_list:
			
 
				+                        with torch.no_grad():
			
 
				+                            out = self.predictor(tensor)
			
 
				+                            out = out.cpu().numpy()
			
 
				+                        sub_batch_out.append(out)
			
 
				+                    # sub_batch_out = np.concatenate(sub_batch_out, axis=0)
			
 
				+                    batch_out_list.append(sub_batch_out)
			
 
				+                log("get file_lock run rec" + " time " + str(time.time()-time2))
			
 
				+
			
 
				+            except RuntimeError:
			
 
				+                log("ocr/tools/infer/predict_rec.py predict.run error! maybe no gpu memory!")
			
 
				+                log("rec predictor shrink memory! ori_im.shape " + str(tensor.shape))
			
 
				+                get_gpu_memory_usage()
			
 
				+                raise RuntimeError
			
 
				+            finally:
			
 
				+                f.close()
			
 
				+                if str(self.device) != 'cpu':
			
 
				+                    torch.cuda.empty_cache()
			
 
				+        else:
			
 
				+            for sub_batch_list in batch_list:
			
 
				+                sub_batch_out = []
			
 
				+                for tensor in sub_batch_list:
			
 
				+                    # print('tensor.shape', tensor.shape)
			
 
				+                    with torch.no_grad():
			
 
				+                        out = self.predictor(tensor)
			
 
				+                        out = out.cpu().numpy()
			
 
				+                    # print('out.shape', out.shape)
			
 
				+                    sub_batch_out.append(out)
			
 
				+                # sub_batch_out = np.concatenate(sub_batch_out, axis=0)
			
 
				+                batch_out_list.append(sub_batch_out)
			
 
				+
			
 
				+        # 转为numpy
			
 
				+        for bi, sub_batch_out in enumerate(batch_out_list):
			
 
				+            batch_out_list[bi] = np.concatenate(sub_batch_out, axis=0)
			
 
				+        return batch_out_list
			
 
				 
			
 
				     def __call__(self, img_list):
			
 
				+        start_time = time.time()
			
 
				+        # print('into TextRecognizer __call__')
			
 
				         img_num = len(img_list)
			
 
				-        # Calculate the aspect ratio of all text bars
			
 
				+
			
 
				+        # 过滤图片比例异常的
			
 
				+        # print('rec len(img_list)', len(img_list))
			
 
				+        temp_list = []
			
 
				+        for img in img_list:
			
 
				+            if img.shape[0] == 0 or img.shape[1] == 0 \
			
 
				+                    or img.shape[0] >= 10000 or img.shape[1] >= 10000 \
			
 
				+                    or img.shape[1] / img.shape[0] <= 0.5 \
			
 
				+                    or img.shape[1] / img.shape[0] >= 100:
			
 
				+                # print('rec img.shape[1] / img.shape[0] <= 0.5', img.shape)
			
 
				+                continue
			
 
				+            temp_list.append(img)
			
 
				+        if not temp_list:
			
 
				+            return None, 0
			
 
				+        img_list = temp_list
			
 
				+
			
 
				+        # 按比例排序
			
 
				         width_list = []
			
 
				         i = 0
			
 
				         for img in img_list:
			
 
				-            # cv2.imwrite('D:/myProject/format_conversion_maxcompute/ocr/test/'+str(i)+'.jpg',img)
			
 
				-            # i+=1
			
 
				-            # cv2.imshow('img', img)
			
 
				-            # cv2.waitKey(1000)
			
 
				             width_list.append(img.shape[1] / float(img.shape[0]))
			
 
				         # Sorting can speed up the recognition process
			
 
				         indices = np.argsort(np.array(width_list))
			
 
				 
			
 
				+        # 分批预测
			
 
				         # rec_res = []
			
 
				         rec_res = [['', 0.0]] * img_num
			
 
				         batch_num = self.rec_batch_num
			
 
				         elapse = 0
			
 
				+        batch_list = []
			
 
				         for beg_img_no in range(0, img_num, batch_num):
			
 
				             end_img_no = min(img_num, beg_img_no + batch_num)
			
 
				             norm_img_batch = []
			
 
				             max_wh_ratio = 0
			
 
				+            # 取这个batch中比例最大的
			
 
				             for ino in range(beg_img_no, end_img_no):
			
 
				                 # h, w = img_list[ino].shape[0:2]
			
 
				                 h, w = img_list[indices[ino]].shape[0:2]
			
 
				                 wh_ratio = w * 1.0 / h
			
 
				                 max_wh_ratio = max(max_wh_ratio, wh_ratio)
			
 
				-            # print('max_wh_ratio',max_wh_ratio)
			
 
				+            # print('max_wh_ratio', max_wh_ratio)
			
 
				+
			
 
				+            # resize image
			
 
				             for ino in range(beg_img_no, end_img_no):
			
 
				-                if self.rec_algorithm != "SRN":
			
 
				-                    # print('max_wh_ratio', max_wh_ratio)
			
 
				-                    norm_img = self.resize_norm_img(img_list[indices[ino]],
			
 
				-                                                    max_wh_ratio)
			
 
				-                    # cv2.imshow('img', norm_img.transpose(1,2,0))
			
 
				-                    # cv2.waitKey(1000)
			
 
				-                    norm_img = norm_img[np.newaxis, :]
			
 
				-                    norm_img_batch.append(norm_img)
			
 
				-                else:
			
 
				-                    # norm_img = self.process_image_srn(
			
 
				-                    #     img_list[indices[ino]], self.rec_image_shape, 8, 25)
			
 
				-                    # encoder_word_pos_list = []
			
 
				-                    # gsrm_word_pos_list = []
			
 
				-                    # gsrm_slf_attn_bias1_list = []
			
 
				-                    # gsrm_slf_attn_bias2_list = []
			
 
				-                    # encoder_word_pos_list.append(norm_img[1])
			
 
				-                    # gsrm_word_pos_list.append(norm_img[2])
			
 
				-                    # gsrm_slf_attn_bias1_list.append(norm_img[3])
			
 
				-                    # gsrm_slf_attn_bias2_list.append(norm_img[4])
			
 
				-                    # norm_img_batch.append(norm_img[0])
			
 
				-                    pass
			
 
				+                # print('img_list[indices[ino]].shape', img_list[indices[ino]].shape)
			
 
				+                norm_img = self.resize_norm_img(img_list[indices[ino]],
			
 
				+                                                max_wh_ratio)
			
 
				+                # print('norm_img.shape', norm_img.shape)
			
 
				+                norm_img = norm_img[np.newaxis, :]
			
 
				+                norm_img_batch.append(norm_img)
			
 
				+
			
 
				             norm_img_batch = np.concatenate(norm_img_batch)
			
 
				             norm_img_batch = norm_img_batch.copy()
			
 
				 
			
 
				-            if self.rec_algorithm == "SRN":
			
 
				-                # starttime = time.time()
			
 
				-                # encoder_word_pos_list = np.concatenate(encoder_word_pos_list)
			
 
				-                # gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list)
			
 
				-                # gsrm_slf_attn_bias1_list = np.concatenate(
			
 
				-                #     gsrm_slf_attn_bias1_list)
			
 
				-                # gsrm_slf_attn_bias2_list = np.concatenate(
			
 
				-                #     gsrm_slf_attn_bias2_list)
			
 
				-                #
			
 
				-                # inputs = [
			
 
				-                #     norm_img_batch,
			
 
				-                #     encoder_word_pos_list,
			
 
				-                #     gsrm_word_pos_list,
			
 
				-                #     gsrm_slf_attn_bias1_list,
			
 
				-                #     gsrm_slf_attn_bias2_list,
			
 
				-                # ]
			
 
				-                # input_names = self.predictor.get_input_names()
			
 
				-                # for i in range(len(input_names)):
			
 
				-                #     input_tensor = self.predictor.get_input_handle(input_names[
			
 
				-                #         i])
			
 
				-                #     input_tensor.copy_from_cpu(inputs[i])
			
 
				-                # self.predictor.run()
			
 
				-                # outputs = []
			
 
				-                # for output_tensor in self.output_tensors:
			
 
				-                #     output = output_tensor.copy_to_cpu()
			
 
				-                #     outputs.append(output)
			
 
				-                # preds = {"predict": outputs[2]}
			
 
				-                pass
			
 
				+            # 预测
			
 
				+            # starttime = time.time()
			
 
				+            # # 当图片很长时，降低batch，防止爆内存
			
 
				+            # # print('norm_img_batch.shape', norm_img_batch.shape)
			
 
				+            # preds = []
			
 
				+            # if norm_img_batch.shape[-1] >= 400:
			
 
				+            #     if norm_img_batch.shape[-1] <= 1000:
			
 
				+            #         mini_batch_size = 4
			
 
				+            #     elif norm_img_batch.shape[-1] <= 3000:
			
 
				+            #         mini_batch_size = 2
			
 
				+            #     else:
			
 
				+            #         mini_batch_size = 1
			
 
				+            #     for bi in range(0, norm_img_batch.shape[0], mini_batch_size):
			
 
				+            #         sub_batch = norm_img_batch[bi:bi+mini_batch_size]
			
 
				+            #         sub_preds = self.predict(sub_batch)
			
 
				+            #         preds.append(sub_preds)
			
 
				+            #         # print('type(sub_preds), sub_preds.shape', type(sub_preds), sub_preds.shape)
			
 
				+            #     preds = np.concatenate(preds, axis=0)
			
 
				+            # else:
			
 
				+            #     preds = self.predict(norm_img_batch)
			
 
				+            # # print('type(preds), preds.shape', type(preds), preds.shape)
			
 
				+            #
			
 
				+            # # 后处理
			
 
				+            # rec_result = self.postprocess_op(preds)
			
 
				+            # for rno in range(len(rec_result)):
			
 
				+            #     rec_res[indices[beg_img_no + rno]] = rec_result[rno]
			
 
				+            # elapse += time.time() - starttime
			
 
				+
			
 
				+            # 根据长度，动态batch
			
 
				+            if norm_img_batch.shape[-1] >= 400:
			
 
				+                if norm_img_batch.shape[-1] <= 1000:
			
 
				+                    mini_batch_size = 4
			
 
				+                elif norm_img_batch.shape[-1] <= 3000:
			
 
				+                    mini_batch_size = 2
			
 
				+                else:
			
 
				+                    mini_batch_size = 1
			
 
				+                sub_batch_list = []
			
 
				+                for bi in range(0, norm_img_batch.shape[0], mini_batch_size):
			
 
				+                    sub_batch = norm_img_batch[bi:bi+mini_batch_size]
			
 
				+                    tensor = torch.from_numpy(sub_batch).float()
			
 
				+                    tensor = tensor.to(self.device)
			
 
				+                    sub_batch_list.append(tensor)
			
 
				             else:
			
 
				-                starttime = time.time()
			
 
				-
			
 
				                 tensor = torch.from_numpy(norm_img_batch).float()
			
 
				-                start_time = time.time()
			
 
				                 tensor = tensor.to(self.device)
			
 
				-                with torch.no_grad():
			
 
				-                    out = self.predictor(tensor)
			
 
				-                logging.info("ocr model predict time - rec" + str(time.time()-start_time))
			
 
				-                out = out.cpu().numpy()
			
 
				-                preds = out
			
 
				+                sub_batch_list = [tensor]
			
 
				 
			
 
				-            # print("tools/infer/predict_rec preds", preds)
			
 
				-            rec_result = self.postprocess_op(preds)
			
 
				-            for rno in range(len(rec_result)):
			
 
				-                # print("predict_rec", img_num, batch_num, beg_img_no,
			
 
				-                #       indices[beg_img_no + rno], len(rec_res))
			
 
				-                rec_res[indices[beg_img_no + rno]] = rec_result[rno]
			
 
				-            elapse += time.time() - starttime
			
 
				-            # 释放内存
			
 
				-            # self.predictor.clear_intermediate_tensor()
			
 
				-            # self.predictor.try_shrink_memory()
			
 
				-
			
 
				-            # gc.collect()
			
 
				-            if str(self.device)!='cpu':
			
 
				-                torch.cuda.empty_cache()
			
 
				-            #     gc.collect()
			
 
				+            batch_list.append(sub_batch_list)
			
 
				+
			
 
				+        # 预测
			
 
				+        batch_out_list = self.predict_batch(batch_list)
			
 
				+
			
 
				+        # 后处理
			
 
				+        for bi, out in enumerate(batch_out_list):
			
 
				+            begin_img_no = bi * batch_num
			
 
				+            rec_result = self.postprocess_op(out)
			
 
				+            for ri in range(len(rec_result)):
			
 
				+                rec_res[indices[begin_img_no + ri]] = rec_result[ri]
			
 
				+        elapse += time.time() - start_time
			
 
				         return rec_res, elapse
			
 
				 
			
 
				 
			
--- a/ocr/tools/infer/predict_system.py
+++ b/ocr/tools/infer/predict_system.py
@@ -26,17 +26,19 @@ import copy
 
				 import numpy as np
			
 
				 import time
			
 
				 from PIL import Image
			
 
				+
			
 
				 os.environ['FLAGS_eager_delete_tensor_gb'] = '0'
			
 
				 import utility as utility
			
 
				 # import ocr.tools.infer.predict_rec as predict_rec
			
 
				-import ocr.tools.infer.predict_rec_pytorch as predict_rec # pytorch rec model
			
 
				+import ocr.tools.infer.predict_rec_pytorch as predict_rec  # pytorch rec model
			
 
				 # import ocr.tools.infer.predict_det as predict_det
			
 
				-import ocr.tools.infer.predict_det_pytorch as predict_det # pytorch det model
			
 
				+import ocr.tools.infer.predict_det_pytorch as predict_det  # pytorch det model
			
 
				 import ocr.tools.infer.predict_cls as predict_cls
			
 
				 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
			
 
				 from ocr.ppocr.utils.logging import get_logger
			
 
				 from ocr.tools.infer.utility import draw_ocr_box_txt
			
 
				-from format_convert.utils import has_intersection
			
 
				+from format_convert.utils import has_intersection, log
			
 
				+from format_convert import _global
			
 
				 
			
 
				 logger = get_logger()
			
 
				 
			
@@ -61,27 +63,36 @@ class TextSystem(object):
 
				         points[:, 0] = points[:, 0] - left
			
 
				         points[:, 1] = points[:, 1] - top
			
 
				         '''
			
 
				-        img_crop_width = int(
			
 
				-            max(
			
 
				-                np.linalg.norm(points[0] - points[1]),
			
 
				-                np.linalg.norm(points[2] - points[3])))
			
 
				-        img_crop_height = int(
			
 
				-            max(
			
 
				-                np.linalg.norm(points[0] - points[3]),
			
 
				-                np.linalg.norm(points[1] - points[2])))
			
 
				-        pts_std = np.float32([[0, 0], [img_crop_width, 0],
			
 
				-                              [img_crop_width, img_crop_height],
			
 
				-                              [0, img_crop_height]])
			
 
				-        M = cv2.getPerspectiveTransform(points, pts_std)
			
 
				-        dst_img = cv2.warpPerspective(
			
 
				-            img,
			
 
				-            M, (img_crop_width, img_crop_height),
			
 
				-            borderMode=cv2.BORDER_REPLICATE,
			
 
				-            flags=cv2.INTER_CUBIC)
			
 
				-        dst_img_height, dst_img_width = dst_img.shape[0:2]
			
 
				-        # if dst_img_height * 1.0 / dst_img_width >= 1.5:
			
 
				-        if dst_img_height * 1.0 / dst_img_width >= 2.0:
			
 
				-            dst_img = np.rot90(dst_img)
			
 
				+        # img_crop_width = int(
			
 
				+        #     max(
			
 
				+        #         np.linalg.norm(points[0] - points[1]),
			
 
				+        #         np.linalg.norm(points[2] - points[3])))
			
 
				+        # img_crop_height = int(
			
 
				+        #     max(
			
 
				+        #         np.linalg.norm(points[0] - points[3]),
			
 
				+        #         np.linalg.norm(points[1] - points[2])))
			
 
				+        # pts_std = np.float32([[0, 0], [img_crop_width, 0],
			
 
				+        #                       [img_crop_width, img_crop_height],
			
 
				+        #                       [0, img_crop_height]])
			
 
				+        # M = cv2.getPerspectiveTransform(points, pts_std)
			
 
				+        # dst_img = cv2.warpPerspective(
			
 
				+        #     img,
			
 
				+        #     M, (img_crop_width, img_crop_height),
			
 
				+        #     borderMode=cv2.BORDER_REPLICATE,
			
 
				+        #     flags=cv2.INTER_CUBIC)
			
 
				+        # print('dst_img.shape', dst_img.shape)
			
 
				+        #
			
 
				+        # print('points', points)
			
 
				+        w = abs(points[2][0] - points[0][0])
			
 
				+        h = abs(points[2][1] - points[0][1])
			
 
				+        dst_img = img[int(points[0][1]):int(points[0][1] + h), int(points[0][0]):int(points[0][0] + w), :]
			
 
				+        # print('dst_img.shape2', dst_img.shape)
			
 
				+        # cv2.imshow('dst_img', dst_img)
			
 
				+        # cv2.waitKey(0)
			
 
				+        # dst_img_height, dst_img_width = dst_img.shape[0:2]
			
 
				+        # # if dst_img_height * 1.0 / dst_img_width >= 1.5:
			
 
				+        # if dst_img_height * 1.0 / dst_img_width >= 2.0:
			
 
				+        #     dst_img = np.rot90(dst_img)
			
 
				         return dst_img
			
 
				 
			
 
				     def print_draw_crop_rec_res(self, img_crop_list, rec_res):
			
@@ -91,6 +102,7 @@ class TextSystem(object):
 
				             logger.info(bno, rec_res[bno])
			
 
				 
			
 
				     def __call__(self, img):
			
 
				+        # print('into TextSystem __call__')
			
 
				         # cv2.imshow('img',img)
			
 
				         # cv2.waitKey(0)
			
 
				         ori_im = img.copy()
			
@@ -98,15 +110,65 @@ class TextSystem(object):
 
				         logger.info("dt_boxes num : {}, elapse : {}".format(
			
 
				             len(dt_boxes), elapse))
			
 
				         if dt_boxes is None:
			
 
				-            return None, None
			
 
				-        img_crop_list = []
			
 
				+            return [], []
			
 
				 
			
 
				-        dt_boxes = sorted_boxes(dt_boxes)
			
 
				+        temp_list = []
			
 
				+        # print('dt_boxes', type(dt_boxes))
			
 
				+        # print('dt_boxes.shape', dt_boxes.shape)
			
 
				+        # 过滤一些比例离谱的box
			
 
				+        for b in dt_boxes:
			
 
				+            w = b[2][0] - b[0][0]
			
 
				+            h = b[2][1] - b[0][1]
			
 
				+            if h == 0 or w == 0 \
			
 
				+                    or h >= 10000 or w >= 10000 \
			
 
				+                    or w / h <= 0.5 or w / h >= 100:
			
 
				+                continue
			
 
				+            temp_list.append(b)
			
 
				+
			
 
				+        if not temp_list:
			
 
				+            return [], []
			
 
				+        dt_boxes = np.array(temp_list)
			
 
				+        # print('dt_boxes.shape2', dt_boxes.shape)
			
 
				+
			
 
				+        # show
			
 
				+        # for b in dt_boxes:
			
 
				+        #     p1 = [int(x) for x in b[0]]
			
 
				+        #     p2 = [int(x) for x in b[2]]
			
 
				+        #     cv2.rectangle(img, p1, p2, (0, 0, 255))
			
 
				+        # cv2.namedWindow('img', cv2.WINDOW_NORMAL)
			
 
				+        # cv2.imshow('img', img)
			
 
				+        # cv2.waitKey(0)
			
 
				+
			
 
				+        # # 检测过多单字box，返回None
			
 
				+        # if len(dt_boxes) >= 150:
			
 
				+        #     short_box_cnt = 0
			
 
				+        #     long_box_cnt = 0
			
 
				+        #     for b in dt_boxes:
			
 
				+        #         w = b[2][0] - b[0][0]
			
 
				+        #         h = b[2][1] - b[0][1]
			
 
				+        #         if w / h < 1.3:
			
 
				+        #             short_box_cnt += 1
			
 
				+        #         if w / h >= 3:
			
 
				+        #             long_box_cnt += 1
			
 
				+        #         print('dt_boxes', w, h, round(w/h, 3))
			
 
				+        #     # print('short_box_cnt, len(dt_boxes)', short_box_cnt, len(dt_boxes))
			
 
				+        #     log('short_box_cnt, long_box_cnt, len(dt_boxes) ' + str([short_box_cnt, long_box_cnt, len(dt_boxes)]))
			
 
				+        #     if short_box_cnt >= 2/3 * len(dt_boxes) and long_box_cnt < 10:
			
 
				+        #         # print('short_box_cnt >= 2/3 * len(dt_boxes), return None')
			
 
				+        #         log('short_box_cnt >= 2/3 * len(dt_boxes), return None. ' + str([short_box_cnt, long_box_cnt, len(dt_boxes)]))
			
 
				+        #         return [], []
			
 
				 
			
 
				+        img_crop_list = []
			
 
				+        dt_boxes = sorted_boxes(dt_boxes)
			
 
				         for bno in range(len(dt_boxes)):
			
 
				             tmp_box = copy.deepcopy(dt_boxes[bno])
			
 
				             img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
			
 
				             img_crop_list.append(img_crop)
			
 
				+        # print('system len(img_crop_list)', len(img_crop_list))
			
 
				+        # for img in img_crop_list:
			
 
				+        #     if img.shape[1] / img.shape[0] <= 0.5:
			
 
				+        # print('system img.shape[1] / img.shape[0] <= 0.5', img.shape)
			
 
				+
			
 
				         if self.use_angle_cls:
			
 
				             img_crop_list, angle_list, elapse = self.text_classifier(
			
 
				                 img_crop_list)
			
@@ -131,6 +193,7 @@ class TextSystem(object):
 
				                 filter_rec_res.append(rec_reuslt)
			
 
				         return filter_boxes, filter_rec_res
			
 
				 
			
 
				+
			
 
				 def boxex_points_fixup(dt_boxes):
			
 
				     # 检查框全部转换为矩形
			
 
				     # for i in range(len(dt_boxes)):
			
@@ -143,39 +206,37 @@ def boxex_points_fixup(dt_boxes):
 
				     #     y_min = min(y_list)
			
 
				     #     dt_boxes[i] = np.array([[x_min,y_min],[x_max,y_min],[x_max,y_max],[x_min,y_max]])
			
 
				 
			
 
				-
			
 
				     for i in range(len(dt_boxes)):
			
 
				         box1 = dt_boxes[i]
			
 
				         box1_point3 = box1[2]
			
 
				-        box1_point4 = box1[3] # 四边形底边的两点坐标
			
 
				-        bottom_line = (min(box1_point3[0],box1_point4[0]),max(box1_point3[0],box1_point4[0]))
			
 
				-        bottom_line_len = abs(bottom_line[1]-bottom_line[0])
			
 
				+        box1_point4 = box1[3]  # 四边形底边的两点坐标
			
 
				+        bottom_line = (min(box1_point3[0], box1_point4[0]), max(box1_point3[0], box1_point4[0]))
			
 
				+        bottom_line_len = abs(bottom_line[1] - bottom_line[0])
			
 
				 
			
 
				-        for j in range(i+1,len(dt_boxes)):
			
 
				+        for j in range(i + 1, len(dt_boxes)):
			
 
				             box2 = dt_boxes[j]
			
 
				             box2_point1 = box2[0]
			
 
				-            box2_point2 = box2[1] # 四边形顶边的两点坐标
			
 
				+            box2_point2 = box2[1]  # 四边形顶边的两点坐标
			
 
				             top_line = (min(box2_point1[0], box2_point2[0]), max(box2_point1[0], box2_point2[0]))
			
 
				-            top_line_len = abs(top_line[1]-top_line[0])
			
 
				+            top_line_len = abs(top_line[1] - top_line[0])
			
 
				             if has_intersection(box1, box2):  # 四边形框是否有交集
			
 
				-                if not (min(top_line)>=max(bottom_line) or min(bottom_line)>=max(top_line)):  # x轴方向上有交集
			
 
				+                if not (min(top_line) >= max(bottom_line) or min(bottom_line) >= max(top_line)):  # x轴方向上有交集
			
 
				                     # 求重合部分y中间值
			
 
				                     mid_y = ((box2_point1[1] + box2_point2[1]) / 2 + (box1_point3[1] + box1_point4[1]) / 2) // 2
			
 
				                     if not mid_y:
			
 
				                         continue
			
 
				-                    max_line_len = max(bottom_line_len,top_line_len)
			
 
				+                    max_line_len = max(bottom_line_len, top_line_len)
			
 
				                     cross_line_len = bottom_line_len + top_line_len - \
			
 
				-                                     (max(bottom_line[1],bottom_line[0],top_line[1],top_line[0]) - min(bottom_line[1],bottom_line[0],top_line[1],top_line[0]))
			
 
				+                                     (max(bottom_line[1], bottom_line[0], top_line[1], top_line[0]) - min(
			
 
				+                                         bottom_line[1], bottom_line[0], top_line[1], top_line[0]))
			
 
				                     # print(cross_line_len,max_line_len,cross_line_len/max_line_len)
			
 
				-                    if cross_line_len/max_line_len>=0.55: # 重合比例
			
 
				-                        box1[2] = [box1_point3[0],mid_y]
			
 
				-                        box1[3] = [box1_point4[0],mid_y]
			
 
				-                        box2[0] = [box2_point1[0],mid_y]
			
 
				-                        box2[1] = [box2_point2[0],mid_y]
			
 
				+                    if cross_line_len / max_line_len >= 0.55:  # 重合比例
			
 
				+                        box1[2] = [box1_point3[0], mid_y]
			
 
				+                        box1[3] = [box1_point4[0], mid_y]
			
 
				+                        box2[0] = [box2_point1[0], mid_y]
			
 
				+                        box2[1] = [box2_point2[0], mid_y]
			
 
				                         break
			
 
				 
			
 
				-
			
 
				-
			
 
				     return dt_boxes
			
 
				 
			
 
				 
			
@@ -247,4 +308,4 @@ def main(args):
 
				 if __name__ == "__main__":
			
 
				     main(utility.parse_args())
			
 
				 
			
 
				-    pass
			
 
				+    pass
			
--- a/start_and_stop/kill_convert.sh
+++ b/start_and_stop/kill_convert.sh
@@ -0,0 +1 @@
 
				+kill -9 $(lsof -i:15010|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
			
--- a/tika_/tika_interface.py
+++ b/tika_/tika_interface.py
@@ -1,3 +1,5 @@
 
				+import base64
			
 
				+import io
			
 
				 import json
			
 
				 import os
			
 
				 import re
			
@@ -7,8 +9,11 @@ import traceback
 
				 from glob import glob
			
 
				 
			
 
				 import psutil
			
 
				+from PIL import Image
			
 
				+from bs4 import BeautifulSoup
			
 
				 
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				+from config.max_compute_config import MAX_COMPUTE
			
 
				 _dir = os.path.abspath(os.path.dirname(__file__))
			
 
				 os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
			
 
				 os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
			
@@ -16,12 +21,19 @@ os.environ["TIKA_PATH"] = _dir + "/files/"
 
				 os.environ["TIKA_LOG_FILE"] = "tika.log"
			
 
				 
			
 
				 from format_convert import _global
			
 
				-from format_convert.utils import log, request_post, dynamic_get_port
			
 
				+from format_convert.utils import log, request_post, dynamic_get_port, get_platform
			
 
				 import tika
			
 
				 from tika import parser, config
			
 
				 from tika.tika import runCommand
			
 
				 from flask import Flask, request
			
 
				 
			
 
				+if get_platform() == "Windows":
			
 
				+    FROM_REMOTE = False
			
 
				+else:
			
 
				+    FROM_REMOTE = True
			
 
				+
			
 
				+if MAX_COMPUTE:
			
 
				+    FROM_REMOTE = False
			
 
				 
			
 
				 # 接口配置
			
 
				 app = Flask(__name__)
			
@@ -46,18 +58,18 @@ def _tika():
 
				         _md5 = request.form.get("md5")
			
 
				         _global.update({"md5": _md5})
			
 
				 
			
 
				-        html = tika_interface(data).get('html')
			
 
				-        return json.dumps({"html": html})
			
 
				+        html = tika_interface(data).get('data')
			
 
				+        return json.dumps({"data": html})
			
 
				     except TimeoutError:
			
 
				-        return json.dumps({"html": [-5]})
			
 
				+        return json.dumps({"data": [-5]})
			
 
				     except:
			
 
				         traceback.print_exc()
			
 
				-        return json.dumps({"html": [-1]})
			
 
				+        return json.dumps({"data": [-1]})
			
 
				     finally:
			
 
				         log("tika interface finish time " + str(time.time()-start_time))
			
 
				 
			
 
				 
			
 
				-def tika_interface(_path, show=1):
			
 
				+def tika_interface(_path, show=0):
			
 
				     try:
			
 
				         # apache tika服务器 提取
			
 
				         # text = runCommand('parse', 'all', _path, '9998', outDir='./files/')
			
@@ -67,7 +79,8 @@ def tika_interface(_path, show=1):
 
				         if globals().get(key):
			
 
				             port = globals().get(key)
			
 
				         else:
			
 
				-            port = dynamic_get_port(port)
			
 
				+            if FROM_REMOTE:
			
 
				+                port = dynamic_get_port(port)
			
 
				             if port is None:
			
 
				                 kill_tika_java_server()
			
 
				                 # return {"html": [-19]}
			
@@ -76,31 +89,104 @@ def tika_interface(_path, show=1):
 
				         url = 'http://localhost:' + str(port)
			
 
				         log('tika ' + key + ' port: ' + str(port))
			
 
				         parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
			
 
				-        html = parsed.get('content')
			
 
				-
			
 
				-        # 处理html
			
 
				-        html = html.split('\n')
			
 
				-        temp_list = []
			
 
				-        for line in html:
			
 
				-            if '<meta' in line:
			
 
				-                continue
			
 
				-            temp_list.append(line)
			
 
				-        html = temp_list
			
 
				-        if len(html) <= 4:
			
 
				-            return {"html": ''}
			
 
				-
			
 
				-        html = html[:2] + ['<meta charset="UTF-8">'] + html[2:]
			
 
				-        html = '\n'.join(html)
			
 
				-        html = re.sub('<table>', '<table border="1">', html)
			
 
				-        html = re.sub(' class="正文"', '', html)
			
 
				+        # print('parsed', parsed)
			
 
				+        html = parsed.get('content', '')
			
 
				 
			
 
				+        # 提取html各种元素，其中图片只是一个映射
			
 
				+        soup = BeautifulSoup(html, 'lxml')
			
 
				+        tag_list = collect_soup_elements(soup)
			
 
				         if show:
			
 
				-            with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
			
 
				-                f.write(html)
			
 
				+            print('tag_list0', tag_list)
			
 
				+
			
 
				+        if not tag_list:
			
 
				+            return {"data": tag_list}
			
 
				+
			
 
				+        # docx不是二进制，不能直接读二进制图片
			
 
				+        if _path[-3:] == 'doc':
			
 
				+            # 直接从二进制提取图片，保存在同一目录下
			
 
				+            ss = re.split('[/\\\]', _path)
			
 
				+            save_dir = os.sep.join(ss[:-1])
			
 
				+            file_name = re.split('\.', ss[-1])[0]
			
 
				+            if show:
			
 
				+                print('save_dir', save_dir)
			
 
				+                print('file_name', file_name)
			
 
				+            image_path_dict = extract_images_from_doc(_path, save_dir)
			
 
				+
			
 
				+            if show:
			
 
				+                print('image_path_dict', image_path_dict)
			
 
				+
			
 
				+            # embedded_images = re.findall(r'embedded:image[^"]+', html)
			
 
				+            match_flag = 1
			
 
				+            for tag in tag_list:
			
 
				+                tag_name, value = tag
			
 
				+                if tag_name != 'img':
			
 
				+                    continue
			
 
				+                # 提取图片文件名
			
 
				+                image_name = file_name + '_' + re.sub('image', '', value)
			
 
				+                if show:
			
 
				+                    print('image_name', image_name)
			
 
				+                # 保证所有image映射都对得上
			
 
				+                real_image_path = image_path_dict.get(image_name)
			
 
				+                if real_image_path is None:
			
 
				+                    match_flag = 0
			
 
				+                    break
			
 
				+                else:
			
 
				+                    tag[1] = real_image_path
			
 
				+            if show:
			
 
				+                print('match_flag', match_flag)
			
 
				+
			
 
				+            if match_flag:
			
 
				+                # 图片数量能对上，则是正确的
			
 
				+                pass
			
 
				+            else:
			
 
				+                # 图片对不上，则删除所有图片类型的tag
			
 
				+                temp_list = []
			
 
				+                for tag_name, value in tag_list:
			
 
				+                    if tag_name == 'img':
			
 
				+                        continue
			
 
				+                    temp_list.append([tag_name, value])
			
 
				+                tag_list = temp_list
			
 
				+
			
 
				+        elif _path[-4:] == 'docx':
			
 
				+            temp_list = []
			
 
				+            for tag_name, value in tag_list:
			
 
				+                if tag_name == 'img':
			
 
				+                    continue
			
 
				+                temp_list.append([tag_name, value])
			
 
				+            tag_list = temp_list
			
 
				+
			
 
				+
			
 
				+        # # 处理html
			
 
				+        # html = html.split('\n')
			
 
				+        # temp_list = []
			
 
				+        # for line in html:
			
 
				+        #     if '<meta' in line:
			
 
				+        #         continue
			
 
				+        #     temp_list.append(line)
			
 
				+        # html = temp_list
			
 
				+        # if len(html) <= 4:
			
 
				+        #     return {"html": ''}
			
 
				+        #
			
 
				+        # html = html[:2] + ['<meta charset="UTF-8">'] + html[2:]
			
 
				+        # html = '\n'.join(html)
			
 
				+        # html = re.sub('<table>', '<table border="1">', html)
			
 
				+        # html = re.sub(' class="正文"', '', html)
			
 
				+        #
			
 
				+        # if show:
			
 
				+        #     with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
			
 
				+        #         f.write(html)
			
 
				+    # except:
			
 
				+    #     traceback.print_exc()
			
 
				+    #     return {"html": [-17]}
			
 
				+    # return {"html": html}
			
 
				+
			
 
				+        if show:
			
 
				+            print('tag_list final', tag_list)
			
 
				+
			
 
				     except:
			
 
				         traceback.print_exc()
			
 
				-        return {"html": [-17]}
			
 
				-    return {"html": html}
			
 
				+        return {"data": [-17]}
			
 
				+    return {"data": tag_list}
			
 
				 
			
 
				 
			
 
				 def kill_tika_java_server():
			
@@ -122,6 +208,139 @@ def kill_tika_java_server():
 
				             os.system(comm)
			
 
				 
			
 
				 
			
 
				+def extract_images_from_doc(doc_file_path, output_folder):
			
 
				+    # 定义图片格式相关的标志
			
 
				+    image_signatures = {
			
 
				+        'jpg': (b'\xFF\xD8', b'\xFF\xD9'),
			
 
				+        'png': (b'\x89PNG', b'\x49\x45\x4E\x44\xAE\x42\x60\x82')
			
 
				+    }
			
 
				+
			
 
				+    file_name = re.split('[/\\\.]', doc_file_path)[-2]
			
 
				+
			
 
				+    # 读取.doc文件
			
 
				+    with open(doc_file_path, 'rb') as doc_file:
			
 
				+        doc_data = doc_file.read()
			
 
				+
			
 
				+    output_file_path_dict = {}
			
 
				+    # 查找并提取所有图片
			
 
				+    for img_format, (start_sig, end_sig) in image_signatures.items():
			
 
				+        start_index = 0
			
 
				+        image_count = 1
			
 
				+        while True:
			
 
				+            # 查找图片起始位置
			
 
				+            start_index = doc_data.find(start_sig, start_index)
			
 
				+            if start_index == -1:
			
 
				+                break
			
 
				+
			
 
				+            # 查找图片结束位置
			
 
				+            end_index = doc_data.find(end_sig, start_index)
			
 
				+            if end_index == -1:
			
 
				+                break
			
 
				+
			
 
				+            # 提取图片数据
			
 
				+            end_index += len(end_sig)  # 包含结束标志
			
 
				+            image_data = doc_data[start_index:end_index]
			
 
				+
			
 
				+            # 保存图片
			
 
				+            # image_count = len([f for f in os.listdir(output_folder) if f.endswith(f'.{img_format}')])
			
 
				+            image_name = f'{file_name}_{image_count}.{img_format}'
			
 
				+            image_path = os.path.join(output_folder, image_name)
			
 
				+            with open(image_path, 'wb') as img_file:
			
 
				+                img_file.write(image_data)
			
 
				+            print(f'Saved {img_format} image to {image_path}')
			
 
				+            output_file_path_dict[image_name] = image_path
			
 
				+
			
 
				+            # 继续查找下一个图片
			
 
				+            start_index = end_index
			
 
				+            image_count += 1
			
 
				+    return output_file_path_dict
			
 
				+
			
 
				+
			
 
				+def is_image_valid(image_path):
			
 
				+    try:
			
 
				+        # 尝试打开图片
			
 
				+        with Image.open(image_path) as img:
			
 
				+            # 如果图片可以打开并且没有问题，则 True返回
			
 
				+            img.load()
			
 
				+            return True
			
 
				+    except:
			
 
				+        # 如果出现异常，则返回 False
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def is_image_data_valid(image_data):
			
 
				+    """
			
 
				+    判断图片数据流是否可以正常打开
			
 
				+
			
 
				+    Args:
			
 
				+        image_data (bytes): 图片数据流
			
 
				+
			
 
				+    Returns:
			
 
				+        bool: 如果图片数据流可以正常打开，则返回True，否则返回False
			
 
				+    """
			
 
				+    try:
			
 
				+        # 将图片数据流转换为文件类对象
			
 
				+        image_file = io.BytesIO(image_data)
			
 
				+        # 尝试打开图片
			
 
				+        with Image.open(image_file) as img:
			
 
				+            # 如果图片可以打开并且没有问题，则返回True
			
 
				+            img.load()
			
 
				+            return True
			
 
				+    except:
			
 
				+        # 如果出现异常，则返回False
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def collect_soup_elements(soup):
			
 
				+    # elements = []
			
 
				+    # # print('tags', tags)
			
 
				+    # for tag in tags:
			
 
				+    #     for element in tag.children:
			
 
				+    #         print('element', element)
			
 
				+    #         if element.name == 'img':
			
 
				+    #             # 提取<img>标签的alt属性
			
 
				+    #             alt_value = element.get('alt')
			
 
				+    #             print(f"Image: {alt_value}")
			
 
				+    #             elements.append(['img', alt_value])
			
 
				+    #         elif element.name == 'table':
			
 
				+    #             elements.append(['table', element])
			
 
				+    #         elif element.string and element.string.strip():
			
 
				+    #             # 提取文本内容
			
 
				+    #             text = element.string.strip()
			
 
				+    #             print(f"Text: {text}")
			
 
				+    #             elements.append(['text', text])
			
 
				+
			
 
				+    table_tags = soup.find_all('table')
			
 
				+    for table in table_tags:
			
 
				+        table['border'] = "1"
			
 
				+
			
 
				+    elements = []
			
 
				+    # 遍历所有标签
			
 
				+    for element in soup.body.descendants:
			
 
				+        if element.name == 'p':
			
 
				+            # 提取文本
			
 
				+            text = element.get_text(strip=True)
			
 
				+            if text:
			
 
				+                elements.append(['text', text])
			
 
				+        elif element.name == 'img':
			
 
				+            # 提取图片alt
			
 
				+            alt = element.get('alt')
			
 
				+            elements.append(['img', alt])
			
 
				+        elif element.name == 'table':
			
 
				+            # 提取表格数据
			
 
				+            # table_data = []
			
 
				+            # for row in element.find_all('tr'):
			
 
				+            #     row_data = []
			
 
				+            #     for cell in row.find_all('td'):
			
 
				+            #         cell_text = cell.get_text(strip=True)
			
 
				+            #         row_data.append(cell_text)
			
 
				+            #     table_data.append(row_data)
			
 
				+            for p_tag in element.find_all('p'):
			
 
				+                p_tag.unwrap()
			
 
				+            elements.append(['table', str(element)])
			
 
				+    return elements
			
 
				+
			
 
				+
			
 
				 def test_interface():
			
 
				     # paths = glob("C:/Users/Administrator/Downloads/1716253106319.doc")
			
 
				     paths = ["files/1716253106319.doc"]
			
@@ -153,6 +372,13 @@ if __name__ == "__main__":
 
				     #     # _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
			
 
				     #     tika_interface(_p)
			
 
				 
			
 
				-    # app.run(host='0.0.0.0', port=5000)
			
 
				+    # app.run(host='0.0.0.0', port=16050)
			
 
				     # test_interface()
			
 
				-    kill_tika_java_server()
			
 
				+    # kill_tika_java_server()
			
 
				+
			
 
				+    # p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
			
 
				+    # extract_images_from_doc(p, '.')
			
 
				+
			
 
				+    _p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
			
 
				+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
			
 
				+    c = tika_interface(_p)
		`@@ -0,0 +1 @@`
		`+kill -9 $(lsof -i:15010\|sed -n '2,$p'\|awk '{print $2}'\|tr '\n' ' ')`